# Source listing metadata: 135 lines, 3.5 KiB, Python
"""Security helpers for secret detection and untrusted content handling."""
|
|
|
|
from __future__ import annotations
|
|
|
|
import re
|
|
from dataclasses import dataclass
|
|
from typing import Any
|
|
|
|
|
|
@dataclass(frozen=True)
class SecretMatch:
    """One secret-like token found while scanning text.

    Instances are immutable because the dataclass is declared frozen.
    """

    # Label of the kind of secret that was detected (for example "jwt").
    secret_type: str
    # The secret-like text that was detected.
    value: str
|
|
|
|
|
|
# Ordered (label, compiled pattern) pairs applied when scanning text for
# secrets. Every group is deliberately non-capturing so that the matched text
# is always the whole secret-like token.
_SECRET_PATTERNS: tuple[tuple[str, re.Pattern[str]], ...] = (
    (
        "openai_key",
        re.compile(r"\bsk-[A-Za-z0-9_-]{20,}\b"),
    ),
    (
        "aws_access_key",
        re.compile(r"\bAKIA[0-9A-Z]{16}\b"),
    ),
    (
        "github_token",
        re.compile(r"\bgh[pousr]_[A-Za-z0-9]{20,}\b"),
    ),
    (
        "jwt",
        re.compile(r"\beyJ[A-Za-z0-9_-]{8,}\.[A-Za-z0-9_-]{4,}\.[A-Za-z0-9_-]{4,}\b"),
    ),
    (
        "private_key",
        # Match the PEM header and, when present, the key body up to the END
        # line. Matching only the header would leave the actual key material
        # untouched when the match is masked. The body is optional so that a
        # truncated block (header only) is still detected, and its length is
        # bounded so a header without an END line cannot trigger a scan of
        # the whole remaining input.
        re.compile(
            r"-----BEGIN (?:RSA |EC |DSA |OPENSSH |ENCRYPTED |)PRIVATE KEY-----"
            r"(?:[\s\S]{0,16384}?"
            r"-----END (?:RSA |EC |DSA |OPENSSH |ENCRYPTED |)PRIVATE KEY-----)?"
        ),
    ),
    (
        "generic_api_key",
        re.compile(r"\b(?:api[_-]?key|token)[\"'=: ]+[A-Za-z0-9_-]{16,}\b", re.IGNORECASE),
    ),
)
|
|
|
|
|
|
def detect_secrets(text: str) -> list[SecretMatch]:
    """Detect common secret patterns in text.

    Every pattern in ``_SECRET_PATTERNS`` is applied to ``text`` in order and
    each non-overlapping match of a pattern is reported. The same stretch of
    text may be reported more than once when several patterns match it.

    Args:
        text: Untrusted text to scan.

    Returns:
        List of detected secret-like values, grouped in ``_SECRET_PATTERNS``
        order and, within one pattern, in order of appearance in ``text``.
    """
    # ``finditer`` + ``group(0)`` always yields the full matched text.
    # ``findall`` returns only the captured groups once a pattern contains
    # any, which would produce a value that need not occur in ``text``.
    return [
        SecretMatch(secret_type=secret_type, value=match.group(0))
        for secret_type, pattern in _SECRET_PATTERNS
        for match in pattern.finditer(text)
    ]
|
|
|
|
|
|
def mask_secret(value: str) -> str:
    """Mask a secret value while preserving minimal context.

    The first and last four characters are kept only when the value is long
    enough that at least half of it stays hidden; shorter values are replaced
    entirely.

    Args:
        value: Raw secret text.

    Returns:
        ``"[REDACTED]"`` for values shorter than 16 characters, otherwise the
        first four and last four characters joined by ``"..."``.
    """
    visible = 4  # characters kept at each end of the value
    # Showing 8 characters of a 9-15 character value would disclose most of
    # it, so require the hidden middle to be at least as long as what is shown.
    if len(value) < visible * 4:
        return "[REDACTED]"
    return f"{value[:visible]}...{value[-visible:]}"
|
|
|
|
|
|
def sanitize_data(value: Any, mode: str = "mask") -> Any:
    """Recursively sanitize secret-like material from arbitrary data.

    Dicts, lists and tuples are rebuilt with their items sanitized; dict keys
    are converted with ``str()`` and are not scanned themselves. Strings are
    scanned with ``detect_secrets``. Any other value is returned unchanged.

    Args:
        value: Arbitrary response payload.
        mode: ``block`` replaces a string containing any finding with
            ``"[REDACTED_SECRET]"``; any other value (default ``mask``)
            replaces each finding in place with its masked form.

    Returns:
        Sanitized payload value.
    """
    if isinstance(value, dict):
        return {str(key): sanitize_data(item, mode=mode) for key, item in value.items()}
    if isinstance(value, list):
        return [sanitize_data(item, mode=mode) for item in value]
    if isinstance(value, tuple):
        return tuple(sanitize_data(item, mode=mode) for item in value)
    if not isinstance(value, str):
        return value

    findings = detect_secrets(value)
    if not findings:
        return value
    if mode == "block":
        return "[REDACTED_SECRET]"

    # Replace the longest findings first. When one finding is contained in a
    # longer one, masking the short one first would alter the text so the
    # longer finding is no longer found, leaving its remainder unmasked.
    # ``dict.fromkeys`` removes duplicates while keeping detection order, and
    # the stable sort keeps that order among findings of equal length.
    unique_values = dict.fromkeys(finding.value for finding in findings)
    masked = value
    for secret in sorted(unique_values, key=len, reverse=True):
        masked = masked.replace(secret, mask_secret(secret))
    return masked
|
|
|
|
|
|
def sanitize_untrusted_text(text: str, max_chars: int) -> str:
    """Cap untrusted repository content at a fixed length for display.

    Security note:
        Repository content is always treated as data and never interpreted as
        executable instructions. The hard length cap applied here keeps
        oversized payloads from being used for prompt-stuffing.

    Args:
        text: Repository text content.
        max_chars: Maximum allowed characters in returned text.

    Returns:
        An empty string when ``max_chars`` is zero or negative, ``text``
        itself when it already fits, otherwise its first ``max_chars``
        characters.
    """
    if max_chars > 0:
        fits = len(text) <= max_chars
        return text if fits else text[:max_chars]
    # A non-positive budget leaves no room for any content.
    return ""
|