first commit

This commit is contained in:
2025-12-21 13:42:30 +01:00
parent 823b825acb
commit f9b24fe248
47 changed files with 8222 additions and 1 deletions

View File

@@ -0,0 +1,457 @@
"""Codebase Quality Agent
AI agent for analyzing overall codebase health, architecture,
technical debt, and documentation coverage.
"""
import base64
import os
from dataclasses import dataclass, field
from agents.base_agent import AgentContext, AgentResult, BaseAgent
@dataclass
class CodebaseMetrics:
    """Metrics collected from codebase analysis.

    All fields default to zero/empty so an instance can be filled in
    incrementally by CodebaseAgent._analyze_metrics.
    """

    total_files: int = 0    # number of code files matched for analysis
    total_lines: int = 0    # summed line count of the scanned files
    languages: dict[str, int] = field(default_factory=dict)  # language name -> file count
    todo_count: int = 0        # lines containing "TODO" (case-insensitive)
    fixme_count: int = 0       # lines containing "FIXME" (case-insensitive)
    deprecated_count: int = 0  # lines containing "DEPRECATED" (case-insensitive)
    missing_docstrings: int = 0  # .py files that define functions but have no triple-quoted string
@dataclass
class CodebaseReport:
    """Complete codebase analysis report.

    Produced by CodebaseAgent._run_ai_analysis and rendered to markdown
    by CodebaseAgent._generate_report_body.
    """

    summary: str            # 2-3 sentence overall assessment
    health_score: float     # 0-100, higher is healthier
    metrics: CodebaseMetrics        # raw counts backing the report
    issues: list[dict]              # dicts with severity/category/description/recommendation keys
    recommendations: list[str]      # top actionable recommendations
    architecture_notes: list[str]   # observations about structure and patterns
class CodebaseAgent(BaseAgent):
    """Agent for codebase quality analysis.

    Walks a repository's file tree via the Gitea API, computes simple
    health metrics, asks an LLM for an assessment, and publishes the
    result as a report issue.
    """

    # Hidden HTML comment searched for in open issue bodies to recognize
    # an existing report issue (see _create_report_issue).
    CODEBASE_AI_MARKER = "<!-- AI_CODEBASE_REVIEW -->"

    # File extensions to analyze, mapped to a display name for the report.
    CODE_EXTENSIONS = {
        ".py": "Python",
        ".js": "JavaScript",
        ".ts": "TypeScript",
        ".go": "Go",
        ".rs": "Rust",
        ".java": "Java",
        ".rb": "Ruby",
        ".php": "PHP",
        ".c": "C",
        ".cpp": "C++",
        ".h": "C/C++ Header",
        ".cs": "C#",
        ".swift": "Swift",
        ".kt": "Kotlin",
    }

    # Path substrings to skip: dependency dirs, VCS metadata, build output,
    # and minified assets (matched as substrings of the full path).
    IGNORE_PATTERNS = [
        "node_modules/",
        "vendor/",
        ".git/",
        "__pycache__/",
        ".venv/",
        "dist/",
        "build/",
        ".min.js",
        ".min.css",
    ]
def can_handle(self, event_type: str, event_data: dict) -> bool:
    """Check if this agent handles the given event.

    Returns True for scheduled/manual runs, or when an issue comment
    contains "<mention_prefix> codebase" (case-insensitive).

    Args:
        event_type: Webhook/event name, e.g. "schedule" or "issue_comment".
        event_data: Raw event payload; only comment.body is inspected.
    """
    agent_config = self.config.get("agents", {}).get("codebase", {})
    if not agent_config.get("enabled", True):
        return False
    # Handle manual trigger via workflow_dispatch or schedule
    if event_type in ("workflow_dispatch", "schedule"):
        return True
    # Handle special issue command
    if event_type == "issue_comment":
        comment_body = event_data.get("comment", {}).get("body", "")
        mention_prefix = self.config.get("interaction", {}).get(
            "mention_prefix", "@ai-bot"
        )
        # BUG FIX: lowercase the prefix as well — a configured prefix with
        # any uppercase letter (e.g. "@AI-Bot") could never match the
        # lowercased comment body.
        if f"{mention_prefix.lower()} codebase" in comment_body.lower():
            return True
    return False
def execute(self, context: AgentContext) -> AgentResult:
    """Run the full codebase-analysis pipeline.

    Steps: collect files, compute metrics, run the AI analysis, then
    publish the results as a repository issue.
    """
    self.logger.info(f"Starting codebase analysis for {context.owner}/{context.repo}")
    performed: list[str] = []

    # 1) File inventory.
    files = self._collect_files(context.owner, context.repo)
    self.logger.info(f"Found {len(files)} files to analyze")

    # 2) Raw metrics from file contents.
    metrics = self._analyze_metrics(context.owner, context.repo, files)
    performed.append(f"Analyzed {metrics.total_files} files")

    # 3) LLM-backed assessment.
    report = self._run_ai_analysis(context, files, metrics)
    performed.append("Generated AI analysis report")

    # 4) Publish as an issue.
    issue_number = self._create_report_issue(context, report)
    performed.append(f"Created/updated report issue #{issue_number}")

    return AgentResult(
        success=True,
        message=f"Codebase analysis complete - Health Score: {report.health_score:.0f}/100",
        data={
            "health_score": report.health_score,
            "total_files": metrics.total_files,
            "issues_found": len(report.issues),
        },
        actions_taken=performed,
    )
def _collect_files(self, owner: str, repo: str) -> list[dict]:
    """Collect code files from the repository tree.

    Depth-first walk via the Gitea contents API, skipping IGNORE_PATTERNS
    paths and keeping only files whose extension is in CODE_EXTENSIONS.

    FIX: the 100-file cap was previously applied only *after* the whole
    tree had been traversed, so the "prevent API exhaustion" limit did
    not actually limit API calls. Traversal now stops as soon as the cap
    is reached; the returned list is identical (same DFS order, first
    100 matches).

    Returns:
        Up to 100 file-entry dicts (as returned by the contents API).
    """
    MAX_FILES = 100  # cap to prevent API exhaustion
    files: list[dict] = []

    def traverse(path: str = "") -> None:
        if len(files) >= MAX_FILES:
            return
        try:
            contents = self.gitea.get_file_contents(owner, repo, path or ".")
            if isinstance(contents, list):
                for item in contents:
                    if len(files) >= MAX_FILES:
                        return
                    item_path = item.get("path", "")
                    # Skip ignored patterns (substring match on full path)
                    if any(p in item_path for p in self.IGNORE_PATTERNS):
                        continue
                    if item.get("type") == "file":
                        ext = os.path.splitext(item_path)[1]
                        if ext in self.CODE_EXTENSIONS:
                            files.append(item)
                    elif item.get("type") == "dir":
                        traverse(item_path)
        except Exception as e:
            # Best-effort: one unreadable directory shouldn't abort the scan.
            self.logger.warning(f"Failed to list {path}: {e}")

    traverse()
    return files
def _analyze_metrics(
    self,
    owner: str,
    repo: str,
    files: list[dict],
) -> CodebaseMetrics:
    """Compute CodebaseMetrics by downloading and scanning file contents.

    Only the first 50 entries of `files` are fetched; language counts,
    line totals and TODO/FIXME/DEPRECATED markers are tallied per file.
    Files that fail to download or decode are skipped.
    """
    metrics = CodebaseMetrics()
    metrics.total_files = len(files)

    for entry in files[:50]:  # analyze top 50 files only
        path = entry.get("path", "")
        extension = os.path.splitext(path)[1]
        language = self.CODE_EXTENSIONS.get(extension, "Unknown")
        metrics.languages[language] = metrics.languages.get(language, 0) + 1

        try:
            data = self.gitea.get_file_contents(owner, repo, path)
            encoded = data.get("content")
            if encoded:
                text = base64.b64decode(encoded).decode("utf-8", errors="ignore")
                source_lines = text.splitlines()
                metrics.total_lines += len(source_lines)

                # Tally marker comments; a single line may count toward
                # several markers (bool adds as 0/1).
                for upper in (ln.upper() for ln in source_lines):
                    metrics.todo_count += "TODO" in upper
                    metrics.fixme_count += "FIXME" in upper
                    metrics.deprecated_count += "DEPRECATED" in upper

                # Crude per-file docstring check (Python only): any defs
                # but no triple-quoted string at all.
                if extension == ".py" and 'def ' in text and '"""' not in text:
                    metrics.missing_docstrings += 1
        except Exception as e:
            self.logger.debug(f"Could not analyze {path}: {e}")

    return metrics
def _run_ai_analysis(
    self,
    context: AgentContext,
    files: list[dict],
    metrics: CodebaseMetrics,
) -> CodebaseReport:
    """Run AI analysis on the codebase.

    Builds a prompt from the collected metrics, a sample of the file
    tree, and the content of key configuration files, then asks the LLM
    for a JSON assessment. Falls back to a heuristic, metrics-derived
    health score if the LLM call (or its JSON parsing) fails.

    Args:
        context: Supplies owner/repo names for the prompt.
        files: Entries from _collect_files (dicts with a "path" key).
        metrics: Pre-computed metrics embedded verbatim in the prompt.

    Returns:
        CodebaseReport; on failure, a degraded report with no issues and
        "Manual review recommended" as the only recommendation.
    """
    # Prepare context for AI: cap the listing at 30 paths to bound prompt size.
    file_list = "\n".join([f"- {f.get('path', '')}" for f in files[:30]])
    language_breakdown = "\n".join(
        [f"- {lang}: {count} files" for lang, count in metrics.languages.items()]
    )
    # Sample some key files (README, manifests, build files) for deeper analysis.
    key_files_content = self._get_key_files_content(
        context.owner, context.repo, files
    )
    prompt = f"""Analyze this codebase and provide a comprehensive quality assessment.
## Repository: {context.owner}/{context.repo}
## Metrics
- Total Files: {metrics.total_files}
- Total Lines: {metrics.total_lines}
- TODO Comments: {metrics.todo_count}
- FIXME Comments: {metrics.fixme_count}
- Deprecated Markers: {metrics.deprecated_count}
## Language Breakdown
{language_breakdown}
## File Structure (sample)
{file_list}
## Key Files Content
{key_files_content}
## Analysis Required
Provide your analysis as JSON with this structure:
```json
{{
"summary": "Overall assessment in 2-3 sentences",
"health_score": 0-100,
"issues": [
{{
"severity": "HIGH|MEDIUM|LOW",
"category": "Architecture|Code Quality|Security|Testing|Documentation",
"description": "Issue description",
"recommendation": "How to fix"
}}
],
"recommendations": ["Top 3-5 actionable recommendations"],
"architecture_notes": ["Observations about code structure and patterns"]
}}
```
Be constructive and actionable. Focus on the most impactful improvements.
"""
    try:
        # call_llm_json is expected to return the parsed JSON dict;
        # missing keys degrade to sensible defaults below.
        result = self.call_llm_json(prompt)
        return CodebaseReport(
            summary=result.get("summary", "Analysis complete"),
            health_score=float(result.get("health_score", 50)),
            metrics=metrics,
            issues=result.get("issues", []),
            recommendations=result.get("recommendations", []),
            architecture_notes=result.get("architecture_notes", []),
        )
    except Exception as e:
        self.logger.error(f"AI analysis failed: {e}")
        # Fallback: derive a rough health score from the marker counts so
        # the run still produces a useful (if basic) report when the LLM
        # is unavailable.
        health_score = 70
        if metrics.todo_count > 10:
            health_score -= 10
        if metrics.fixme_count > 5:
            health_score -= 10
        return CodebaseReport(
            summary=f"Basic analysis complete (AI unavailable: {e})",
            health_score=health_score,
            metrics=metrics,
            issues=[],
            recommendations=["Manual review recommended"],
            architecture_notes=[],
        )
def _get_key_files_content(
    self,
    owner: str,
    repo: str,
    files: list[dict],
) -> str:
    """Fetch the content of key project files for AI analysis context.

    Scans `files` for well-known manifests/build files, downloads each
    and truncates long ones to 2000 characters.

    FIX: content was previously fetched for *every* key file found and
    the list sliced to 5 afterwards; the loop now stops once 5 files
    have been collected, so no API calls are spent on discarded content.
    The returned string is identical (first 5 matches in order).

    Returns:
        Markdown sections per file, or a placeholder when none found.
    """
    MAX_KEY_FILES = 5
    key_file_names = [
        "README.md",
        "setup.py",
        "pyproject.toml",
        "package.json",
        "Cargo.toml",
        "go.mod",
        "Makefile",
        "Dockerfile",
    ]
    content_parts = []
    for file_info in files:
        if len(content_parts) >= MAX_KEY_FILES:
            break  # enough context collected; skip remaining fetches
        filepath = file_info.get("path", "")
        if os.path.basename(filepath) not in key_file_names:
            continue
        try:
            content_data = self.gitea.get_file_contents(owner, repo, filepath)
            if content_data.get("content"):
                content = base64.b64decode(content_data["content"]).decode(
                    "utf-8", errors="ignore"
                )
                # Truncate long files to keep the prompt small
                if len(content) > 2000:
                    content = content[:2000] + "\n... (truncated)"
                content_parts.append(f"### {filepath}\n```\n{content}\n```")
        except Exception:
            # Best-effort: silently skip files that fail to download/decode.
            pass
    return "\n\n".join(content_parts) or "No key configuration files found."
def _create_report_issue(
    self,
    context: AgentContext,
    report: CodebaseReport,
) -> int:
    """Create or update a report issue.

    Searches open issues labeled "ai-codebase-report" for one whose body
    contains CODEBASE_AI_MARKER and updates it in place; otherwise a new
    issue is created (with the label attached if it exists in the repo).

    Returns:
        The issue number, or 0 if creation failed.
    """
    # Generate issue body
    body = self._generate_report_body(report)
    # Look for an existing report issue so reruns update it rather than
    # creating duplicates.
    try:
        issues = self.gitea.list_issues(
            context.owner, context.repo, state="open", labels=["ai-codebase-report"]
        )
        for issue in issues:
            if self.CODEBASE_AI_MARKER in issue.get("body", ""):
                # Update existing issue body in place.
                self.gitea.update_issue(
                    context.owner,
                    context.repo,
                    issue["number"],
                    body=body,
                )
                return issue["number"]
    except Exception as e:
        # Non-fatal: fall through and create a fresh issue instead.
        self.logger.warning(f"Failed to check for existing report: {e}")
    # Create new issue
    try:
        # Resolve the label name to its ID — presumably the create API
        # expects numeric label IDs (TODO confirm against GiteaClient).
        # A missing label is fine; the issue is just created unlabeled.
        labels = []
        try:
            repo_labels = self.gitea.get_repo_labels(context.owner, context.repo)
            for label in repo_labels:
                if label["name"] == "ai-codebase-report":
                    labels.append(label["id"])
                    break
        except Exception:
            pass
        issue = self.gitea.create_issue(
            context.owner,
            context.repo,
            title=f"AI Codebase Report - {context.repo}",
            body=body,
            labels=labels,
        )
        return issue["number"]
    except Exception as e:
        self.logger.error(f"Failed to create report issue: {e}")
        # Signal failure with 0 rather than raising; execute() still
        # records the (zero) issue number in its action log.
        return 0
def _generate_report_body(self, report: CodebaseReport) -> str:
    """Render the markdown body for the report issue.

    Fixes:
    - Embeds CODEBASE_AI_MARKER (an invisible HTML comment) at the top of
      the body; _create_report_issue searches issue bodies for this marker
      to update an existing report, so without it every run would create a
      duplicate issue.
    - `health_emoji` and the per-issue severity emoji were previously
      computed but never interpolated into the output; they now appear in
      their headings as evidently intended.

    Returns:
        The complete markdown report as a single string.
    """
    # Traffic-light indicator for the overall score.
    health_emoji = "🟢" if report.health_score >= 80 else ("🟡" if report.health_score >= 60 else "🔴")
    lines = [
        # Invisible in rendered markdown; lets reruns find this issue.
        self.CODEBASE_AI_MARKER,
        f"{self.AI_DISCLAIMER}",
        "",
        "# AI Codebase Quality Report",
        "",
        f"## Health Score: {health_emoji} {report.health_score:.0f}/100",
        "",
        report.summary,
        "",
        "---",
        "",
        "## Metrics",
        "",
        "| Metric | Value |",
        "|--------|-------|",
        f"| Total Files | {report.metrics.total_files} |",
        f"| Total Lines | {report.metrics.total_lines:,} |",
        f"| TODO Comments | {report.metrics.todo_count} |",
        f"| FIXME Comments | {report.metrics.fixme_count} |",
        f"| Deprecated | {report.metrics.deprecated_count} |",
        "",
    ]
    # Languages, most files first.
    if report.metrics.languages:
        lines.append("### Languages")
        lines.append("")
        for lang, count in sorted(
            report.metrics.languages.items(), key=lambda x: -x[1]
        ):
            lines.append(f"- **{lang}**: {count} files")
        lines.append("")
    # Issues (top 10), each with a severity traffic light.
    if report.issues:
        lines.append("## Issues Found")
        lines.append("")
        for issue in report.issues[:10]:
            severity = issue.get("severity", "MEDIUM")
            emoji = "🔴" if severity == "HIGH" else ("🟡" if severity == "MEDIUM" else "🟢")
            lines.append(f"### {emoji} [{severity}] {issue.get('category', 'General')}")
            lines.append("")
            lines.append(issue.get("description", ""))
            lines.append("")
            lines.append(f"**Recommendation:** {issue.get('recommendation', '')}")
            lines.append("")
    # Top recommendations (max 5).
    if report.recommendations:
        lines.append("## Recommendations")
        lines.append("")
        for i, rec in enumerate(report.recommendations[:5], 1):
            lines.append(f"{i}. {rec}")
        lines.append("")
    # Architecture notes (max 5).
    if report.architecture_notes:
        lines.append("## Architecture Notes")
        lines.append("")
        for note in report.architecture_notes[:5]:
            lines.append(f"- {note}")
        lines.append("")
    lines.append("---")
    lines.append("*Generated by AI Codebase Agent*")  # was a needless f-string
    return "\n".join(lines)