openrabbit/tools/ai-review/security/validate_workflows.py

#!/usr/bin/env python3
"""Pre-commit hook for validating workflow files.

Checks workflow files for security anti-patterns:
- Full webhook data in environment variables
- Missing input validation
- Unsafe shell operations
"""

import re
import sys
from pathlib import Path

import yaml

# Regex rules applied to the raw text of each workflow file.
#
# Keys of each rule:
#   name       - short label printed in the report
#   pattern    - regex searched in the file text; each match is one finding
#   severity   - "HIGH" or "MEDIUM"
#   message    - remediation advice printed alongside the finding
#   exclude_if - optional regex; when it matches anywhere in the file the
#                rule is considered mitigated and yields no findings
SECURITY_CHECKS = [
    {
        "name": "Full webhook data in env vars",
        "pattern": r"toJSON\(github\.event\)|toJSON\(gitea\.event\)",
        "severity": "HIGH",
        "message": "Do not pass full webhook data to environment variables. Use minimal extraction instead.",
    },
    {
        "name": "Unvalidated repository input",
        "pattern": r"\$\{\{\s*(?:github|gitea)\.repository\s*\}\}",
        "severity": "MEDIUM",
        "message": "Repository name should be validated before use. Add format validation.",
        "exclude_if": r"grep -qE.*repository",  # OK if validation present
    },
    {
        "name": "Direct user input in shell",
        "pattern": r"\$\{\{\s*(?:github|gitea)\.event\.comment\.body\s*\}\}",
        "severity": "MEDIUM",
        "message": "Comment body should be properly escaped. Use jq -Rs for JSON escaping.",
        "exclude_if": r"jq -Rs",  # OK if using jq for escaping
    },
    {
        "name": "Inline Python without validation",
        "pattern": r"python -c.*json\.loads\(os\.environ",
        "severity": "HIGH",
        "message": "Use utils/safe_dispatch.py instead of inline Python with env vars.",
    },
]


def check_workflow_file(filepath: str) -> list[dict]:
    """Check a workflow file for security issues.

    The file is read as UTF-8 and must parse as YAML; if either step fails a
    single ``ERROR`` finding is returned. Otherwise every rule in
    ``SECURITY_CHECKS`` is matched against the raw file text (the parsed
    YAML document itself is not inspected).

    Args:
        filepath: Path to workflow YAML file

    Returns:
        List of findings. Rule findings carry ``name``, ``severity``,
        ``message``, ``line`` (1-based line of the match start) and ``match``
        (first 80 characters of the matched text). Read and parse failures
        carry only ``severity`` and ``message``.
    """
    try:
        with open(filepath, encoding="utf-8") as f:
            content = f.read()
    except (OSError, ValueError) as e:
        # OSError covers missing/unreadable files; ValueError covers content
        # that is not valid UTF-8 (UnicodeDecodeError is a ValueError).
        return [{"severity": "ERROR", "message": f"Could not read file: {e}"}]

    # Parse YAML only to confirm the file is well-formed; the result is unused.
    try:
        yaml.safe_load(content)
    except yaml.YAMLError as e:
        return [{"severity": "ERROR", "message": f"Invalid YAML: {e}"}]

    findings = []

    for check in SECURITY_CHECKS:
        # The exclusion is evaluated against the whole file, so its outcome is
        # the same for every match of this rule: decide once, up front.
        if "exclude_if" in check and re.search(check["exclude_if"], content):
            continue  # Validation present, skip this rule

        for match in re.finditer(check["pattern"], content, re.MULTILINE):
            # Line number = newlines before the match start, 1-based.
            line_num = content.count("\n", 0, match.start()) + 1

            findings.append(
                {
                    "name": check["name"],
                    "severity": check["severity"],
                    "message": check["message"],
                    "line": line_num,
                    "match": match.group(0)[:80],  # First 80 chars
                }
            )

    return findings


def main():
    """Validate every workflow file named on the command line.

    Prints a report section for each file that has findings, followed by a
    total count.

    Returns:
        1 when any finding has severity ``HIGH`` or ``ERROR`` (the commit
        should be blocked), otherwise 0.
    """
    targets = sys.argv[1:]
    if not targets:
        print("No workflow files to validate")
        return 0

    icons = {
        "HIGH": "🔴",
        "MEDIUM": "🟡",
        "LOW": "🔵",
        "ERROR": "❌",
    }
    rule = "=" * 60
    blocking = False
    finding_count = 0

    for filepath in targets:
        results = check_workflow_file(filepath)
        if not results:
            continue

        finding_count += len(results)

        print(f"\n{rule}")
        print(f"Workflow security issues in: {filepath}")
        print(rule)

        for item in results:
            level = item.get("severity", "UNKNOWN")
            icon = icons.get(level, "⚪")

            print(f"\n{icon} [{level}] {item.get('name', 'Issue')}")
            print(f"   Line: {item.get('line', 'N/A')}")
            print(f"   {item['message']}")

            # Read/parse errors have no matched text to show.
            if "match" in item:
                print(f"   Match: {item['match']}")

            if level in ("HIGH", "ERROR"):
                blocking = True

    any_findings = finding_count > 0

    if any_findings:
        print(f"\n{rule}")
        print(f"Total findings: {finding_count}")
        print(rule)

    if blocking:
        print("\n❌ COMMIT BLOCKED: Critical workflow security issues found")
        print("Please fix the issues above before committing.")
        print("\nSee SECURITY.md for workflow security best practices.")
        return 1

    if any_findings:
        print("\n⚠️  Medium severity issues found - review recommended")

    return 0


if __name__ == "__main__":