"""Codebase Quality Agent AI agent for analyzing overall codebase health, architecture, technical debt, and documentation coverage. """ import base64 import os from dataclasses import dataclass, field from agents.base_agent import AgentContext, AgentResult, BaseAgent @dataclass class CodebaseMetrics: """Metrics collected from codebase analysis.""" total_files: int = 0 total_lines: int = 0 languages: dict = field(default_factory=dict) todo_count: int = 0 fixme_count: int = 0 deprecated_count: int = 0 missing_docstrings: int = 0 @dataclass class CodebaseReport: """Complete codebase analysis report.""" summary: str health_score: float # 0-100 metrics: CodebaseMetrics issues: list[dict] recommendations: list[str] architecture_notes: list[str] class CodebaseAgent(BaseAgent): """Agent for codebase quality analysis.""" # Marker for codebase reports CODEBASE_AI_MARKER = "" # File extensions to analyze CODE_EXTENSIONS = { ".py": "Python", ".js": "JavaScript", ".ts": "TypeScript", ".go": "Go", ".rs": "Rust", ".java": "Java", ".rb": "Ruby", ".php": "PHP", ".c": "C", ".cpp": "C++", ".h": "C/C++ Header", ".cs": "C#", ".swift": "Swift", ".kt": "Kotlin", } # Files to ignore IGNORE_PATTERNS = [ "node_modules/", "vendor/", ".git/", "__pycache__/", ".venv/", "dist/", "build/", ".min.js", ".min.css", ] def can_handle(self, event_type: str, event_data: dict) -> bool: """Check if this agent handles the given event.""" agent_config = self.config.get("agents", {}).get("codebase", {}) if not agent_config.get("enabled", True): return False # Handle manual trigger via workflow_dispatch or schedule if event_type in ("workflow_dispatch", "schedule"): return True # Handle special issue command if event_type == "issue_comment": comment_body = event_data.get("comment", {}).get("body", "") mention_prefix = self.config.get("interaction", {}).get( "mention_prefix", "@ai-bot" ) if f"{mention_prefix} codebase" in comment_body.lower(): return True return False def execute(self, context: AgentContext) -> AgentResult: """Execute codebase analysis.""" self.logger.info(f"Starting codebase analysis for {context.owner}/{context.repo}") actions_taken = [] # Step 1: Collect file list from repository files = self._collect_files(context.owner, context.repo) self.logger.info(f"Found {len(files)} files to analyze") # Step 2: Analyze metrics metrics = self._analyze_metrics(context.owner, context.repo, files) actions_taken.append(f"Analyzed {metrics.total_files} files") # Step 3: Run AI analysis on key files report = self._run_ai_analysis(context, files, metrics) actions_taken.append("Generated AI analysis report") # Step 4: Create or update report issue issue_number = self._create_report_issue(context, report) actions_taken.append(f"Created/updated report issue #{issue_number}") return AgentResult( success=True, message=f"Codebase analysis complete - Health Score: {report.health_score:.0f}/100", data={ "health_score": report.health_score, "total_files": metrics.total_files, "issues_found": len(report.issues), }, actions_taken=actions_taken, ) def _collect_files(self, owner: str, repo: str) -> list[dict]: """Collect list of files from the repository.""" files = [] def traverse(path: str = ""): try: contents = self.gitea.get_file_contents(owner, repo, path or ".") if isinstance(contents, list): for item in contents: item_path = item.get("path", "") # Skip ignored patterns if any(p in item_path for p in self.IGNORE_PATTERNS): continue if item.get("type") == "file": ext = os.path.splitext(item_path)[1] if ext in self.CODE_EXTENSIONS: files.append(item) elif item.get("type") == "dir": traverse(item_path) except Exception as e: self.logger.warning(f"Failed to list {path}: {e}") traverse() return files[:100] # Limit to prevent API exhaustion def _analyze_metrics( self, owner: str, repo: str, files: list[dict], ) -> CodebaseMetrics: """Analyze metrics from files.""" metrics = CodebaseMetrics() metrics.total_files = len(files) for file_info in files[:50]: # Analyze top 50 files filepath = file_info.get("path", "") ext = os.path.splitext(filepath)[1] lang = self.CODE_EXTENSIONS.get(ext, "Unknown") metrics.languages[lang] = metrics.languages.get(lang, 0) + 1 try: content_data = self.gitea.get_file_contents(owner, repo, filepath) if content_data.get("content"): content = base64.b64decode(content_data["content"]).decode( "utf-8", errors="ignore" ) lines = content.splitlines() metrics.total_lines += len(lines) # Count markers for line in lines: line_upper = line.upper() if "TODO" in line_upper: metrics.todo_count += 1 if "FIXME" in line_upper: metrics.fixme_count += 1 if "DEPRECATED" in line_upper: metrics.deprecated_count += 1 # Check for docstrings (Python) if ext == ".py": if 'def ' in content and '"""' not in content: metrics.missing_docstrings += 1 except Exception as e: self.logger.debug(f"Could not analyze {filepath}: {e}") return metrics def _run_ai_analysis( self, context: AgentContext, files: list[dict], metrics: CodebaseMetrics, ) -> CodebaseReport: """Run AI analysis on the codebase.""" # Prepare context for AI file_list = "\n".join([f"- {f.get('path', '')}" for f in files[:30]]) language_breakdown = "\n".join( [f"- {lang}: {count} files" for lang, count in metrics.languages.items()] ) # Sample some key files for deeper analysis key_files_content = self._get_key_files_content( context.owner, context.repo, files ) prompt = f"""Analyze this codebase and provide a comprehensive quality assessment. ## Repository: {context.owner}/{context.repo} ## Metrics - Total Files: {metrics.total_files} - Total Lines: {metrics.total_lines} - TODO Comments: {metrics.todo_count} - FIXME Comments: {metrics.fixme_count} - Deprecated Markers: {metrics.deprecated_count} ## Language Breakdown {language_breakdown} ## File Structure (sample) {file_list} ## Key Files Content {key_files_content} ## Analysis Required Provide your analysis as JSON with this structure: ```json {{ "summary": "Overall assessment in 2-3 sentences", "health_score": 0-100, "issues": [ {{ "severity": "HIGH|MEDIUM|LOW", "category": "Architecture|Code Quality|Security|Testing|Documentation", "description": "Issue description", "recommendation": "How to fix" }} ], "recommendations": ["Top 3-5 actionable recommendations"], "architecture_notes": ["Observations about code structure and patterns"] }} ``` Be constructive and actionable. Focus on the most impactful improvements. """ try: result = self.call_llm_json(prompt) return CodebaseReport( summary=result.get("summary", "Analysis complete"), health_score=float(result.get("health_score", 50)), metrics=metrics, issues=result.get("issues", []), recommendations=result.get("recommendations", []), architecture_notes=result.get("architecture_notes", []), ) except Exception as e: self.logger.error(f"AI analysis failed: {e}") # Try to log the raw response if possible (requires accessing the last response) # Since we don't have direct access here, we rely on having good logging in LLMClient if needed. # But let's add a note to the summary. # Calculate basic health score from metrics health_score = 70 if metrics.todo_count > 10: health_score -= 10 if metrics.fixme_count > 5: health_score -= 10 return CodebaseReport( summary=f"Basic analysis complete (AI unavailable: {e})", health_score=health_score, metrics=metrics, issues=[], recommendations=["Manual review recommended"], architecture_notes=[], ) def _get_key_files_content( self, owner: str, repo: str, files: list[dict], ) -> str: """Get content of key files for AI analysis.""" key_file_names = [ "README.md", "setup.py", "pyproject.toml", "package.json", "Cargo.toml", "go.mod", "Makefile", "Dockerfile", ] content_parts = [] for file_info in files: filepath = file_info.get("path", "") filename = os.path.basename(filepath) if filename in key_file_names: try: content_data = self.gitea.get_file_contents(owner, repo, filepath) if content_data.get("content"): content = base64.b64decode(content_data["content"]).decode( "utf-8", errors="ignore" ) # Truncate long files if len(content) > 2000: content = content[:2000] + "\n... (truncated)" content_parts.append(f"### {filepath}\n```\n{content}\n```") except Exception: pass return "\n\n".join(content_parts[:5]) or "No key configuration files found." def _create_report_issue( self, context: AgentContext, report: CodebaseReport, ) -> int: """Create or update a report issue.""" # Generate issue body body = self._generate_report_body(report) # Look for existing report issue try: issues = self.gitea.list_issues( context.owner, context.repo, state="open", labels=["ai-codebase-report"] ) for issue in issues: if self.CODEBASE_AI_MARKER in issue.get("body", ""): # Update existing issue body self.gitea.update_issue( context.owner, context.repo, issue["number"], body=body, ) return issue["number"] except Exception as e: self.logger.warning(f"Failed to check for existing report: {e}") # Create new issue try: # Check for label ID labels = [] try: repo_labels = self.gitea.get_repo_labels(context.owner, context.repo) for label in repo_labels: if label["name"] == "ai-codebase-report": labels.append(label["id"]) break except Exception: pass issue = self.gitea.create_issue( context.owner, context.repo, title=f"AI Codebase Report - {context.repo}", body=body, labels=labels, ) return issue["number"] except Exception as e: self.logger.error(f"Failed to create report issue: {e}") return 0 def _generate_report_body(self, report: CodebaseReport) -> str: """Generate the report issue body.""" health_emoji = "🟢" if report.health_score >= 80 else ("🟡" if report.health_score >= 60 else "🔴") lines = [ f"{self.AI_DISCLAIMER}", "", "# AI Codebase Quality Report", "", f"## Health Score: {report.health_score:.0f}/100", "", report.summary, "", "---", "", "## Metrics", "", "| Metric | Value |", "|--------|-------|", f"| Total Files | {report.metrics.total_files} |", f"| Total Lines | {report.metrics.total_lines:,} |", f"| TODO Comments | {report.metrics.todo_count} |", f"| FIXME Comments | {report.metrics.fixme_count} |", f"| Deprecated | {report.metrics.deprecated_count} |", "", ] # Languages if report.metrics.languages: lines.append("### Languages") lines.append("") for lang, count in sorted( report.metrics.languages.items(), key=lambda x: -x[1] ): lines.append(f"- **{lang}**: {count} files") lines.append("") # Issues if report.issues: lines.append("## Issues Found") lines.append("") for issue in report.issues[:10]: severity = issue.get("severity", "MEDIUM") emoji = "🔴" if severity == "HIGH" else ("🟡" if severity == "MEDIUM" else "🟢") lines.append(f"### [{severity}] {issue.get('category', 'General')}") lines.append("") lines.append(issue.get("description", "")) lines.append("") lines.append(f"**Recommendation:** {issue.get('recommendation', '')}") lines.append("") # Recommendations if report.recommendations: lines.append("## Recommendations") lines.append("") for i, rec in enumerate(report.recommendations[:5], 1): lines.append(f"{i}. {rec}") lines.append("") # Architecture notes if report.architecture_notes: lines.append("## Architecture Notes") lines.append("") for note in report.architecture_notes[:5]: lines.append(f"- {note}") lines.append("") lines.append("---") lines.append(f"*Generated by AI Codebase Agent*") return "\n".join(lines)