Implement GuardDen Discord moderation bot
Features: - Core moderation: warn, kick, ban, timeout, strike system - Automod: banned words filter, scam detection, anti-spam, link filtering - AI moderation: Claude/OpenAI integration, NSFW detection, phishing analysis - Verification system: button, captcha, math, emoji challenges - Rate limiting system with configurable scopes - Event logging: joins, leaves, message edits/deletes, voice activity - Per-guild configuration with caching - Docker deployment support Bug fixes applied: - Fixed await on session.delete() in guild_config.py - Fixed memory leak in AI moderation message tracking (use deque) - Added error handling to bot shutdown - Added error handling to timeout command - Removed unused Literal import - Added prefix validation - Added image analysis limit (3 per message) - Fixed test mock for SQLAlchemy model
This commit is contained in:
149
src/guardden/services/ai/base.py
Normal file
149
src/guardden/services/ai/base.py
Normal file
@@ -0,0 +1,149 @@
|
||||
"""Base classes for AI providers."""
|
||||
|
||||
from abc import ABC, abstractmethod
|
||||
from dataclasses import dataclass, field
|
||||
from enum import Enum
|
||||
from typing import Literal
|
||||
|
||||
|
||||
class ContentCategory(str, Enum):
    """Categories of problematic content.

    Subclasses ``str`` so members compare equal to their raw string
    values and serialize cleanly (e.g. in JSON payloads or DB columns).
    """

    # SAFE is the explicit "no violation" category; providers may return it
    # alongside is_flagged=False rather than an empty category list.
    SAFE = "safe"
    HARASSMENT = "harassment"
    HATE_SPEECH = "hate_speech"
    SEXUAL = "sexual"
    VIOLENCE = "violence"
    SELF_HARM = "self_harm"
    SPAM = "spam"
    SCAM = "scam"
    MISINFORMATION = "misinformation"
|
||||
|
||||
|
||||
@dataclass
class ModerationResult:
    """Outcome of an AI text-moderation pass.

    Produced by :meth:`AIProvider.moderate_text`; defaults describe a
    clean, unflagged message.
    """

    is_flagged: bool = False
    confidence: float = 0.0  # provider confidence in the verdict, 0.0 to 1.0
    categories: list[ContentCategory] = field(default_factory=list)
    explanation: str = ""
    suggested_action: Literal["none", "warn", "delete", "timeout", "ban"] = "none"

    @property
    def severity(self) -> int:
        """Severity score 0-100 derived from confidence plus category weights.

        Unflagged results are always 0. Otherwise the confidence supplies
        up to 50 points and each flagged category adds a fixed bump
        (30 high / 20 medium / 10 otherwise), capped at 100.
        """
        if not self.is_flagged:
            return 0

        high = {
            ContentCategory.HATE_SPEECH,
            ContentCategory.SELF_HARM,
            ContentCategory.SCAM,
        }
        medium = {
            ContentCategory.HARASSMENT,
            ContentCategory.VIOLENCE,
            ContentCategory.SEXUAL,
        }

        def _weight(category: ContentCategory) -> int:
            # Per-category contribution to the overall score.
            if category in high:
                return 30
            if category in medium:
                return 20
            return 10

        score = int(self.confidence * 50) + sum(_weight(c) for c in self.categories)
        return min(score, 100)
|
||||
|
||||
|
||||
@dataclass
class ImageAnalysisResult:
    """Result of AI image analysis.

    Returned by :meth:`AIProvider.analyze_image`; defaults describe an
    image with no detected issues.
    """

    # Classification flags set by the provider.
    is_nsfw: bool = False
    is_violent: bool = False
    is_disturbing: bool = False
    # Provider confidence in the flags above — presumably 0.0 to 1.0,
    # matching ModerationResult.confidence; TODO confirm per provider.
    confidence: float = 0.0
    # Free-text description of the image content from the provider.
    description: str = ""
    # Provider-specific category labels (plain strings, not ContentCategory).
    categories: list[str] = field(default_factory=list)
|
||||
|
||||
|
||||
@dataclass
class PhishingAnalysisResult:
    """Result of AI phishing/scam analysis.

    Returned by :meth:`AIProvider.analyze_phishing`; defaults describe a
    URL with no detected risk.
    """

    is_phishing: bool = False
    # Provider confidence in the verdict — presumably 0.0 to 1.0, matching
    # the other result types; TODO confirm per provider.
    confidence: float = 0.0
    # Human-readable indicators that contributed to the verdict.
    risk_factors: list[str] = field(default_factory=list)
    explanation: str = ""
|
||||
|
||||
|
||||
class AIProvider(ABC):
    """Abstract base class for AI providers.

    Concrete backends (e.g. Claude or OpenAI integrations) implement the
    three analysis entry points plus :meth:`close` for resource cleanup.
    """

    @abstractmethod
    async def moderate_text(
        self,
        content: str,
        context: str | None = None,
        sensitivity: int = 50,
    ) -> ModerationResult:
        """Analyze text content for policy violations.

        Args:
            content: The text to analyze.
            context: Optional context about the conversation/server.
            sensitivity: 0-100; higher means more strict.

        Returns:
            ModerationResult with analysis.
        """
        ...

    @abstractmethod
    async def analyze_image(
        self,
        image_url: str,
        sensitivity: int = 50,
    ) -> ImageAnalysisResult:
        """Analyze an image for NSFW or inappropriate content.

        Args:
            image_url: URL of the image to analyze.
            sensitivity: 0-100; higher means more strict.

        Returns:
            ImageAnalysisResult with analysis.
        """
        ...

    @abstractmethod
    async def analyze_phishing(
        self,
        url: str,
        message_content: str | None = None,
    ) -> PhishingAnalysisResult:
        """Analyze a URL for phishing/scam indicators.

        Args:
            url: The URL to analyze.
            message_content: Optional full message for context.

        Returns:
            PhishingAnalysisResult with analysis.
        """
        ...

    @abstractmethod
    async def close(self) -> None:
        """Release any resources held by the provider (HTTP sessions, etc.)."""
        ...
|
||||
Reference in New Issue
Block a user