"""Codebase Quality Agent

AI agent for analyzing overall codebase health, architecture,
technical debt, and documentation coverage.
"""
|
|
|
|
import base64
|
|
import os
|
|
from dataclasses import dataclass, field
|
|
|
|
from agents.base_agent import AgentContext, AgentResult, BaseAgent
|
|
|
|
|
|
@dataclass
class CodebaseMetrics:
    """Metrics collected from codebase analysis."""

    # Number of source files discovered by the collector (capped).
    total_files: int = 0
    # Total line count across the files whose content was downloaded.
    total_lines: int = 0
    # Language display name -> number of files in that language.
    languages: dict[str, int] = field(default_factory=dict)
    # Lines containing "TODO" (case-insensitive) across analyzed files.
    todo_count: int = 0
    # Lines containing "FIXME" (case-insensitive).
    fixme_count: int = 0
    # Lines containing "DEPRECATED" (case-insensitive).
    deprecated_count: int = 0
    # Python files that define functions but contain no triple-quoted string.
    missing_docstrings: int = 0
|
|
@dataclass
class CodebaseReport:
    """Complete codebase analysis report."""

    # Short natural-language assessment (2-3 sentences).
    summary: str
    # Overall health score on a 0-100 scale.
    health_score: float  # 0-100
    # Raw metrics the report was derived from.
    metrics: CodebaseMetrics
    # Issue dicts with severity/category/description/recommendation keys
    # as produced by the LLM (see the prompt schema in CodebaseAgent).
    issues: list[dict]
    # Top actionable recommendations (rendered as a numbered list).
    recommendations: list[str]
    # Observations about code structure and patterns.
    architecture_notes: list[str]
|
|
|
class CodebaseAgent(BaseAgent):
    """Agent for codebase quality analysis.

    Pipeline: enumerate source files via the Gitea contents API, compute
    simple health metrics (line counts, TODO/FIXME/DEPRECATED markers,
    language breakdown), ask the LLM for a structured quality assessment,
    and publish the result as a single, continuously-updated report issue.
    """

    # HTML marker embedded in report bodies so a previously created report
    # issue can be recognized and updated instead of duplicated.
    CODEBASE_AI_MARKER = "<!-- AI_CODEBASE_REVIEW -->"

    # File extensions considered source code, mapped to a display name.
    CODE_EXTENSIONS = {
        ".py": "Python",
        ".js": "JavaScript",
        ".ts": "TypeScript",
        ".go": "Go",
        ".rs": "Rust",
        ".java": "Java",
        ".rb": "Ruby",
        ".php": "PHP",
        ".c": "C",
        ".cpp": "C++",
        ".h": "C/C++ Header",
        ".cs": "C#",
        ".swift": "Swift",
        ".kt": "Kotlin",
    }

    # Path fragments marking generated/vendored content to skip.
    IGNORE_PATTERNS = [
        "node_modules/",
        "vendor/",
        ".git/",
        "__pycache__/",
        ".venv/",
        "dist/",
        "build/",
        ".min.js",
        ".min.css",
    ]

    # Hard caps that keep API usage bounded.
    _MAX_FILES = 100          # files collected during tree traversal
    _MAX_METRIC_FILES = 50    # files whose content is downloaded for metrics

    def can_handle(self, event_type: str, event_data: dict) -> bool:
        """Return True if this agent should run for the given event.

        Triggers on workflow_dispatch/schedule events, or on an issue
        comment containing "<mention_prefix> codebase".
        """
        agent_config = self.config.get("agents", {}).get("codebase", {})
        if not agent_config.get("enabled", True):
            return False

        # Manual trigger via workflow_dispatch or a scheduled run.
        if event_type in ("workflow_dispatch", "schedule"):
            return True

        # Explicit "<prefix> codebase" command in an issue comment.
        if event_type == "issue_comment":
            comment_body = event_data.get("comment", {}).get("body", "")
            mention_prefix = self.config.get("interaction", {}).get(
                "mention_prefix", "@ai-bot"
            )
            if f"{mention_prefix} codebase" in comment_body.lower():
                return True

        return False

    def execute(self, context: AgentContext) -> AgentResult:
        """Execute codebase analysis and publish the report issue."""
        self.logger.info(
            "Starting codebase analysis for %s/%s", context.owner, context.repo
        )

        actions_taken = []

        # Step 1: Collect file list from repository (bounded traversal).
        files = self._collect_files(context.owner, context.repo)
        self.logger.info("Found %d files to analyze", len(files))

        # Step 2: Analyze raw metrics (lines, markers, languages).
        metrics = self._analyze_metrics(context.owner, context.repo, files)
        actions_taken.append(f"Analyzed {metrics.total_files} files")

        # Step 3: Run AI analysis on key files.
        report = self._run_ai_analysis(context, files, metrics)
        actions_taken.append("Generated AI analysis report")

        # Step 4: Create or update the report issue.
        issue_number = self._create_report_issue(context, report)
        actions_taken.append(f"Created/updated report issue #{issue_number}")

        return AgentResult(
            success=True,
            message=f"Codebase analysis complete - Health Score: {report.health_score:.0f}/100",
            data={
                "health_score": report.health_score,
                "total_files": metrics.total_files,
                "issues_found": len(report.issues),
            },
            actions_taken=actions_taken,
        )

    def _collect_files(self, owner: str, repo: str) -> list[dict]:
        """Collect analyzable source files from the repository.

        Traversal stops as soon as ``_MAX_FILES`` entries are collected
        (the previous implementation walked the whole tree and truncated
        afterwards, which defeated the API-exhaustion cap on large repos).
        """
        files: list[dict] = []

        def traverse(path: str = "") -> None:
            # Early exit: cap already reached, stop descending.
            if len(files) >= self._MAX_FILES:
                return
            try:
                contents = self.gitea.get_file_contents(owner, repo, path or ".")
            except Exception as e:
                self.logger.warning("Failed to list %s: %s", path, e)
                return
            if not isinstance(contents, list):
                return
            for item in contents:
                if len(files) >= self._MAX_FILES:
                    return
                item_path = item.get("path", "")

                # Skip generated/vendored paths.
                if any(p in item_path for p in self.IGNORE_PATTERNS):
                    continue

                if item.get("type") == "file":
                    ext = os.path.splitext(item_path)[1]
                    if ext in self.CODE_EXTENSIONS:
                        files.append(item)
                elif item.get("type") == "dir":
                    traverse(item_path)

        traverse()
        return files[: self._MAX_FILES]

    def _analyze_metrics(
        self,
        owner: str,
        repo: str,
        files: list[dict],
    ) -> CodebaseMetrics:
        """Compute line counts, marker counts and language breakdown.

        Only the first ``_MAX_METRIC_FILES`` files have their content
        downloaded; the rest still count toward ``total_files``.
        """
        metrics = CodebaseMetrics()
        metrics.total_files = len(files)

        for file_info in files[: self._MAX_METRIC_FILES]:
            filepath = file_info.get("path", "")
            ext = os.path.splitext(filepath)[1]
            lang = self.CODE_EXTENSIONS.get(ext, "Unknown")

            metrics.languages[lang] = metrics.languages.get(lang, 0) + 1

            try:
                content_data = self.gitea.get_file_contents(owner, repo, filepath)
                # A directory path returns a list from the contents API;
                # only dicts with a "content" field are decodable.
                if not isinstance(content_data, dict) or not content_data.get(
                    "content"
                ):
                    continue
                content = base64.b64decode(content_data["content"]).decode(
                    "utf-8", errors="ignore"
                )
                lines = content.splitlines()
                metrics.total_lines += len(lines)

                # Count technical-debt markers (case-insensitive).
                for line in lines:
                    line_upper = line.upper()
                    if "TODO" in line_upper:
                        metrics.todo_count += 1
                    if "FIXME" in line_upper:
                        metrics.fixme_count += 1
                    if "DEPRECATED" in line_upper:
                        metrics.deprecated_count += 1

                # Crude docstring heuristic (Python only): a file defining
                # functions but containing no triple-quoted string at all.
                if ext == ".py":
                    if 'def ' in content and '"""' not in content:
                        metrics.missing_docstrings += 1

            except Exception as e:
                self.logger.debug("Could not analyze %s: %s", filepath, e)

        return metrics

    def _run_ai_analysis(
        self,
        context: AgentContext,
        files: list[dict],
        metrics: CodebaseMetrics,
    ) -> CodebaseReport:
        """Run AI analysis on the codebase.

        Falls back to a heuristic report (score derived from marker counts)
        when the LLM call fails or returns unparsable JSON.
        """
        # Prepare context for the AI: a file-structure sample and the
        # per-language file counts.
        file_list = "\n".join([f"- {f.get('path', '')}" for f in files[:30]])
        language_breakdown = "\n".join(
            [f"- {lang}: {count} files" for lang, count in metrics.languages.items()]
        )

        # Sample some key files (manifests, README, ...) for deeper analysis.
        key_files_content = self._get_key_files_content(
            context.owner, context.repo, files
        )

        prompt = f"""Analyze this codebase and provide a comprehensive quality assessment.

## Repository: {context.owner}/{context.repo}

## Metrics
- Total Files: {metrics.total_files}
- Total Lines: {metrics.total_lines}
- TODO Comments: {metrics.todo_count}
- FIXME Comments: {metrics.fixme_count}
- Deprecated Markers: {metrics.deprecated_count}

## Language Breakdown
{language_breakdown}

## File Structure (sample)
{file_list}

## Key Files Content
{key_files_content}

## Analysis Required

Provide your analysis as JSON with this structure:
```json
{{
  "summary": "Overall assessment in 2-3 sentences",
  "health_score": 0-100,
  "issues": [
    {{
      "severity": "HIGH|MEDIUM|LOW",
      "category": "Architecture|Code Quality|Security|Testing|Documentation",
      "description": "Issue description",
      "recommendation": "How to fix"
    }}
  ],
  "recommendations": ["Top 3-5 actionable recommendations"],
  "architecture_notes": ["Observations about code structure and patterns"]
}}
```

Be constructive and actionable. Focus on the most impactful improvements.
"""

        try:
            result = self.call_llm_json(prompt)
            return CodebaseReport(
                summary=result.get("summary", "Analysis complete"),
                health_score=float(result.get("health_score", 50)),
                metrics=metrics,
                issues=result.get("issues", []),
                recommendations=result.get("recommendations", []),
                architecture_notes=result.get("architecture_notes", []),
            )
        except Exception as e:
            self.logger.error("AI analysis failed: %s", e)
            # Heuristic fallback: start from a neutral score and penalize
            # heavy marker counts.
            health_score = 70
            if metrics.todo_count > 10:
                health_score -= 10
            if metrics.fixme_count > 5:
                health_score -= 10

            return CodebaseReport(
                summary=f"Basic analysis complete (AI unavailable: {e})",
                health_score=health_score,
                metrics=metrics,
                issues=[],
                recommendations=["Manual review recommended"],
                architecture_notes=[],
            )

    def _get_key_files_content(
        self,
        owner: str,
        repo: str,
        files: list[dict],
    ) -> str:
        """Get content of key project files (manifests/README) for AI analysis.

        Returns at most 5 files, each truncated to 2000 characters, rendered
        as fenced markdown sections.
        """
        key_file_names = [
            "README.md",
            "setup.py",
            "pyproject.toml",
            "package.json",
            "Cargo.toml",
            "go.mod",
            "Makefile",
            "Dockerfile",
        ]

        content_parts = []

        for file_info in files:
            filepath = file_info.get("path", "")
            filename = os.path.basename(filepath)

            if filename in key_file_names:
                try:
                    content_data = self.gitea.get_file_contents(owner, repo, filepath)
                    if content_data.get("content"):
                        content = base64.b64decode(content_data["content"]).decode(
                            "utf-8", errors="ignore"
                        )
                        # Truncate long files to keep the prompt bounded.
                        if len(content) > 2000:
                            content = content[:2000] + "\n... (truncated)"
                        content_parts.append(f"### {filepath}\n```\n{content}\n```")
                except Exception:
                    # Best-effort: a missing/unreadable key file is not fatal.
                    pass

        return "\n\n".join(content_parts[:5]) or "No key configuration files found."

    def _create_report_issue(
        self,
        context: AgentContext,
        report: CodebaseReport,
    ) -> int:
        """Create or update the report issue; return its number (0 on failure)."""
        body = self._generate_report_body(report)

        # Prefer updating an existing report issue, identified by the
        # hidden marker inside the body.
        try:
            issues = self.gitea.list_issues(
                context.owner, context.repo, state="open", labels=["ai-codebase-report"]
            )
            for issue in issues:
                if self.CODEBASE_AI_MARKER in issue.get("body", ""):
                    self.gitea.update_issue(
                        context.owner,
                        context.repo,
                        issue["number"],
                        body=body,
                    )
                    return issue["number"]
        except Exception as e:
            self.logger.warning("Failed to check for existing report: %s", e)

        # No existing report: create a new issue.
        try:
            # Resolve the label name to its ID if the repo defines it
            # (the create API expects label IDs).
            labels = []
            try:
                repo_labels = self.gitea.get_repo_labels(context.owner, context.repo)
                for label in repo_labels:
                    if label["name"] == "ai-codebase-report":
                        labels.append(label["id"])
                        break
            except Exception:
                pass

            issue = self.gitea.create_issue(
                context.owner,
                context.repo,
                title=f"AI Codebase Report - {context.repo}",
                body=body,
                labels=labels,
            )
            return issue["number"]
        except Exception as e:
            self.logger.error("Failed to create report issue: %s", e)
            return 0

    def _generate_report_body(self, report: CodebaseReport) -> str:
        """Generate the markdown body for the report issue.

        Fix: the health-score and per-issue severity emojis were computed
        but never rendered; they are now included in the section headers.
        """
        health_emoji = "🟢" if report.health_score >= 80 else ("🟡" if report.health_score >= 60 else "🔴")

        lines = [
            f"{self.AI_DISCLAIMER}",
            "",
            "# AI Codebase Quality Report",
            "",
            f"## Health Score: {health_emoji} {report.health_score:.0f}/100",
            "",
            report.summary,
            "",
            "---",
            "",
            "## Metrics",
            "",
            "| Metric | Value |",
            "|--------|-------|",
            f"| Total Files | {report.metrics.total_files} |",
            f"| Total Lines | {report.metrics.total_lines:,} |",
            f"| TODO Comments | {report.metrics.todo_count} |",
            f"| FIXME Comments | {report.metrics.fixme_count} |",
            f"| Deprecated | {report.metrics.deprecated_count} |",
            "",
        ]

        # Languages, most files first.
        if report.metrics.languages:
            lines.append("### Languages")
            lines.append("")
            for lang, count in sorted(
                report.metrics.languages.items(), key=lambda x: -x[1]
            ):
                lines.append(f"- **{lang}**: {count} files")
            lines.append("")

        # Issues (capped at 10).
        if report.issues:
            lines.append("## Issues Found")
            lines.append("")
            for issue in report.issues[:10]:
                severity = issue.get("severity", "MEDIUM")
                emoji = "🔴" if severity == "HIGH" else ("🟡" if severity == "MEDIUM" else "🟢")
                lines.append(f"### {emoji} [{severity}] {issue.get('category', 'General')}")
                lines.append("")
                lines.append(issue.get("description", ""))
                lines.append("")
                lines.append(f"**Recommendation:** {issue.get('recommendation', '')}")
                lines.append("")

        # Recommendations (capped at 5).
        if report.recommendations:
            lines.append("## Recommendations")
            lines.append("")
            for i, rec in enumerate(report.recommendations[:5], 1):
                lines.append(f"{i}. {rec}")
            lines.append("")

        # Architecture notes (capped at 5).
        if report.architecture_notes:
            lines.append("## Architecture Notes")
            lines.append("")
            for note in report.architecture_notes[:5]:
                lines.append(f"- {note}")
            lines.append("")

        lines.append("---")
        lines.append("*Generated by AI Codebase Agent*")

        return "\n".join(lines)