# openrabbit/tools/ai-review/security/sast_scanner.py

"""SAST Scanner Integration

Integrates with external SAST tools like Bandit and Semgrep
to provide comprehensive security analysis.
"""

import json
import logging
import os
import shutil
import subprocess
import tempfile
from dataclasses import dataclass, field
from typing import Any

# Module-level logger named after this module.
# NOTE(review): nothing in the visible code references it; SASTScanner logs
# through its own ``self.logger`` instead — confirm whether it is still needed.
logger = logging.getLogger(__name__)


@dataclass
class SASTFinding:
    """A finding from a SAST tool."""

    tool: str
    rule_id: str
    severity: str  # CRITICAL, HIGH, MEDIUM, LOW
    file: str
    line: int
    message: str
    code_snippet: str | None = None
    cwe: str | None = None
    owasp: str | None = None
    fix_recommendation: str | None = None


@dataclass
class SASTReport:
    """Aggregated result of one scan across all SAST tools.

    Attributes:
        total_findings: Number of findings in the report.
        findings_by_severity: Finding count keyed by severity label.
        findings_by_tool: Finding count keyed by tool name.
        findings: The individual findings.
        tools_run: Names of the tools that were run.
        errors: Error messages collected during the scan; a fresh empty
            list is created for each report when none is supplied.
    """

    # Summary counters.
    total_findings: int
    findings_by_severity: dict[str, int]
    findings_by_tool: dict[str, int]

    # Detailed results.
    findings: list[SASTFinding]
    tools_run: list[str]

    # default_factory keeps the default list from being shared between reports.
    errors: list[str] = field(default_factory=list)


class SASTScanner:
    """Aggregator for multiple SAST tools."""

    def __init__(self, config: dict | None = None):
        """Initialize the SAST scanner.

        Args:
            config: Configuration dictionary with tool settings.
        """
        self.config = config or {}
        self.logger = logging.getLogger(self.__class__.__name__)

    def scan_directory(self, path: str) -> SASTReport:
        """Scan a directory with all enabled SAST tools.

        Args:
            path: Path to the directory to scan.

        Returns:
            Combined SASTReport from all tools.
        """
        all_findings = []
        tools_run = []
        errors = []

        sast_config = self.config.get("security", {}).get("sast", {})

        # Run Bandit (Python)
        if sast_config.get("bandit", True):
            if self._is_tool_available("bandit"):
                try:
                    findings = self._run_bandit(path)
                    all_findings.extend(findings)
                    tools_run.append("bandit")
                except Exception as e:
                    errors.append(f"Bandit error: {e}")
            else:
                self.logger.debug("Bandit not installed, skipping")

        # Run Semgrep
        if sast_config.get("semgrep", True):
            if self._is_tool_available("semgrep"):
                try:
                    findings = self._run_semgrep(path)
                    all_findings.extend(findings)
                    tools_run.append("semgrep")
                except Exception as e:
                    errors.append(f"Semgrep error: {e}")
            else:
                self.logger.debug("Semgrep not installed, skipping")

        # Run Trivy (if enabled for filesystem scanning)
        if sast_config.get("trivy", False):
            if self._is_tool_available("trivy"):
                try:
                    findings = self._run_trivy(path)
                    all_findings.extend(findings)
                    tools_run.append("trivy")
                except Exception as e:
                    errors.append(f"Trivy error: {e}")
            else:
                self.logger.debug("Trivy not installed, skipping")

        # Calculate statistics
        by_severity = {"CRITICAL": 0, "HIGH": 0, "MEDIUM": 0, "LOW": 0}
        by_tool = {}

        for finding in all_findings:
            sev = finding.severity.upper()
            if sev in by_severity:
                by_severity[sev] += 1
            tool = finding.tool
            by_tool[tool] = by_tool.get(tool, 0) + 1

        return SASTReport(
            total_findings=len(all_findings),
            findings_by_severity=by_severity,
            findings_by_tool=by_tool,
            findings=all_findings,
            tools_run=tools_run,
            errors=errors,
        )

    def scan_content(self, content: str, filename: str) -> list[SASTFinding]:
        """Scan file content with SAST tools.

        Args:
            content: File content to scan.
            filename: Name of the file (for language detection).

        Returns:
            List of SASTFinding objects.
        """
        # Create temporary file for scanning
        with tempfile.NamedTemporaryFile(
            mode="w",
            suffix=os.path.splitext(filename)[1],
            delete=False,
        ) as f:
            f.write(content)
            temp_path = f.name

        try:
            report = self.scan_directory(os.path.dirname(temp_path))
            # Filter findings for our specific file
            findings = [
                f
                for f in report.findings
                if os.path.basename(f.file) == os.path.basename(temp_path)
            ]
            # Update file path to original filename
            for finding in findings:
                finding.file = filename
            return findings
        finally:
            os.unlink(temp_path)

    def scan_diff(self, diff: str) -> list[SASTFinding]:
        """Scan a diff for security issues.

        Only scans added/modified lines.

        Args:
            diff: Git diff content.

        Returns:
            List of SASTFinding objects.
        """
        findings = []

        # Parse diff and extract added content per file
        files_content = {}
        current_file = None
        current_content = []

        for line in diff.splitlines():
            if line.startswith("diff --git"):
                if current_file and current_content:
                    files_content[current_file] = "\n".join(current_content)
                current_file = None
                current_content = []
                # Extract filename
                match = line.split(" b/")
                if len(match) > 1:
                    current_file = match[1]
            elif line.startswith("+") and not line.startswith("+++"):
                if current_file:
                    current_content.append(line[1:])  # Remove + prefix

        # Don't forget last file
        if current_file and current_content:
            files_content[current_file] = "\n".join(current_content)

        # Scan each file's content
        for filename, content in files_content.items():
            if content.strip():
                file_findings = self.scan_content(content, filename)
                findings.extend(file_findings)

        return findings

    def _is_tool_available(self, tool: str) -> bool:
        """Check if a tool is installed and available."""
        return shutil.which(tool) is not None

    def _run_bandit(self, path: str) -> list[SASTFinding]:
        """Run Bandit security scanner.

        Args:
            path: Path to scan.

        Returns:
            List of SASTFinding objects.
        """
        findings = []

        try:
            result = subprocess.run(
                [
                    "bandit",
                    "-r",
                    path,
                    "-f",
                    "json",
                    "-ll",  # Only high and medium severity
                    "--quiet",
                ],
                capture_output=True,
                text=True,
                timeout=120,
            )

            if result.stdout:
                data = json.loads(result.stdout)

                for issue in data.get("results", []):
                    severity = issue.get("issue_severity", "MEDIUM").upper()

                    findings.append(
                        SASTFinding(
                            tool="bandit",
                            rule_id=issue.get("test_id", ""),
                            severity=severity,
                            file=issue.get("filename", ""),
                            line=issue.get("line_number", 0),
                            message=issue.get("issue_text", ""),
                            code_snippet=issue.get("code", ""),
                            cwe=f"CWE-{issue.get('issue_cwe', {}).get('id', '')}"
                            if issue.get("issue_cwe")
                            else None,
                            fix_recommendation=issue.get("more_info", ""),
                        )
                    )

        except subprocess.TimeoutExpired:
            self.logger.warning("Bandit scan timed out")
        except json.JSONDecodeError as e:
            self.logger.warning(f"Failed to parse Bandit output: {e}")
        except Exception as e:
            self.logger.warning(f"Bandit scan failed: {e}")

        return findings

    def _run_semgrep(self, path: str) -> list[SASTFinding]:
        """Run Semgrep security scanner.

        Args:
            path: Path to scan.

        Returns:
            List of SASTFinding objects.
        """
        findings = []

        # Get Semgrep config from settings
        sast_config = self.config.get("security", {}).get("sast", {})
        semgrep_rules = sast_config.get("semgrep_rules", "p/security-audit")

        try:
            result = subprocess.run(
                [
                    "semgrep",
                    "--config",
                    semgrep_rules,
                    "--json",
                    "--quiet",
                    path,
                ],
                capture_output=True,
                text=True,
                timeout=180,
            )

            if result.stdout:
                data = json.loads(result.stdout)

                for finding in data.get("results", []):
                    # Map Semgrep severity to our scale
                    sev_map = {
                        "ERROR": "HIGH",
                        "WARNING": "MEDIUM",
                        "INFO": "LOW",
                    }
                    severity = sev_map.get(
                        finding.get("extra", {}).get("severity", "WARNING"), "MEDIUM"
                    )

                    metadata = finding.get("extra", {}).get("metadata", {})

                    findings.append(
                        SASTFinding(
                            tool="semgrep",
                            rule_id=finding.get("check_id", ""),
                            severity=severity,
                            file=finding.get("path", ""),
                            line=finding.get("start", {}).get("line", 0),
                            message=finding.get("extra", {}).get("message", ""),
                            code_snippet=finding.get("extra", {}).get("lines", ""),
                            cwe=metadata.get("cwe", [None])[0]
                            if metadata.get("cwe")
                            else None,
                            owasp=metadata.get("owasp", [None])[0]
                            if metadata.get("owasp")
                            else None,
                            fix_recommendation=metadata.get("fix", ""),
                        )
                    )

        except subprocess.TimeoutExpired:
            self.logger.warning("Semgrep scan timed out")
        except json.JSONDecodeError as e:
            self.logger.warning(f"Failed to parse Semgrep output: {e}")
        except Exception as e:
            self.logger.warning(f"Semgrep scan failed: {e}")

        return findings

    def _run_trivy(self, path: str) -> list[SASTFinding]:
        """Run Trivy filesystem scanner.

        Args:
            path: Path to scan.

        Returns:
            List of SASTFinding objects.
        """
        findings = []

        try:
            result = subprocess.run(
                [
                    "trivy",
                    "fs",
                    "--format",
                    "json",
                    "--security-checks",
                    "vuln,secret,config",
                    path,
                ],
                capture_output=True,
                text=True,
                timeout=180,
            )

            if result.stdout:
                data = json.loads(result.stdout)

                for result_item in data.get("Results", []):
                    target = result_item.get("Target", "")

                    # Process vulnerabilities
                    for vuln in result_item.get("Vulnerabilities", []):
                        severity = vuln.get("Severity", "MEDIUM").upper()

                        findings.append(
                            SASTFinding(
                                tool="trivy",
                                rule_id=vuln.get("VulnerabilityID", ""),
                                severity=severity,
                                file=target,
                                line=0,
                                message=vuln.get("Title", ""),
                                cwe=vuln.get("CweIDs", [None])[0]
                                if vuln.get("CweIDs")
                                else None,
                                fix_recommendation=f"Upgrade to {vuln.get('FixedVersion', 'latest')}"
                                if vuln.get("FixedVersion")
                                else None,
                            )
                        )

                    # Process secrets
                    for secret in result_item.get("Secrets", []):
                        findings.append(
                            SASTFinding(
                                tool="trivy",
                                rule_id=secret.get("RuleID", ""),
                                severity="HIGH",
                                file=target,
                                line=secret.get("StartLine", 0),
                                message=f"Secret detected: {secret.get('Title', '')}",
                                code_snippet=secret.get("Match", ""),
                            )
                        )

        except subprocess.TimeoutExpired:
            self.logger.warning("Trivy scan timed out")
        except json.JSONDecodeError as e:
            self.logger.warning(f"Failed to parse Trivy output: {e}")
        except Exception as e:
            self.logger.warning(f"Trivy scan failed: {e}")

        return findings


def get_sast_scanner(config: dict | None = None) -> SASTScanner:
    """Build a SASTScanner from an optional configuration.

    Args:
        config: Configuration dictionary, forwarded unchanged to the
            SASTScanner constructor.

    Returns:
        The newly created SASTScanner instance.
    """
    scanner = SASTScanner(config=config)
    return scanner