Improve NSFW image-moderation accuracy: add graded NSFW categories, severity scores, and per-category actions
Some checks failed
CI/CD Pipeline / Code Quality Checks (push) Failing after 4m50s
CI/CD Pipeline / Security Scanning (push) Successful in 16s
CI/CD Pipeline / Tests (3.11) (push) Successful in 9m44s
CI/CD Pipeline / Tests (3.12) (push) Successful in 9m37s
CI/CD Pipeline / Build Docker Image (push) Has been skipped

This commit is contained in:
2026-01-24 17:37:09 +01:00
parent 136ae04388
commit a5811113f0
4 changed files with 143 additions and 22 deletions

View File

@@ -16,6 +16,17 @@ from guardden.utils.ratelimit import RateLimitExceeded
logger = logging.getLogger(__name__) logger = logging.getLogger(__name__)
def _get_action_for_nsfw(category: str) -> str:
"""Map NSFW category to suggested action."""
mapping = {
"suggestive": "warn",
"partial_nudity": "delete",
"nudity": "delete",
"explicit": "timeout",
}
return mapping.get(category, "none")
class AIModeration(commands.Cog): class AIModeration(commands.Cog):
"""AI-powered content moderation.""" """AI-powered content moderation."""
@@ -300,7 +311,8 @@ class AIModeration(commands.Cog):
sensitivity=config.ai_sensitivity, sensitivity=config.ai_sensitivity,
) )
logger.info( logger.info(
f"Image result: nsfw={image_result.is_nsfw}, violent={image_result.is_violent}, conf={image_result.confidence}" f"Image result: nsfw={image_result.is_nsfw}, category={image_result.nsfw_category}, "
f"severity={image_result.nsfw_severity}, violent={image_result.is_violent}, conf={image_result.confidence}"
) )
if ( if (
@@ -315,12 +327,23 @@ class AIModeration(commands.Cog):
if image_result.is_violent: if image_result.is_violent:
categories.append(ContentCategory.VIOLENCE) categories.append(ContentCategory.VIOLENCE)
# Use nsfw_severity if available, otherwise use None for default calculation
severity_override = (
image_result.nsfw_severity if image_result.nsfw_severity > 0 else None
)
# Include NSFW category in explanation for better logging
explanation = image_result.description
if image_result.nsfw_category and image_result.nsfw_category != "none":
explanation = f"[{image_result.nsfw_category}] {explanation}"
result = ModerationResult( result = ModerationResult(
is_flagged=True, is_flagged=True,
confidence=image_result.confidence, confidence=image_result.confidence,
categories=categories, categories=categories,
explanation=image_result.description, explanation=explanation,
suggested_action="delete", suggested_action=_get_action_for_nsfw(image_result.nsfw_category),
severity_override=severity_override,
) )
await self._handle_ai_result(message, result, "Image Analysis") await self._handle_ai_result(message, result, "Image Analysis")
return return
@@ -346,7 +369,8 @@ class AIModeration(commands.Cog):
sensitivity=config.ai_sensitivity, sensitivity=config.ai_sensitivity,
) )
logger.info( logger.info(
f"Embed image result: nsfw={image_result.is_nsfw}, violent={image_result.is_violent}, conf={image_result.confidence}" f"Embed image result: nsfw={image_result.is_nsfw}, category={image_result.nsfw_category}, "
f"severity={image_result.nsfw_severity}, violent={image_result.is_violent}, conf={image_result.confidence}"
) )
if ( if (
@@ -360,12 +384,23 @@ class AIModeration(commands.Cog):
if image_result.is_violent: if image_result.is_violent:
categories.append(ContentCategory.VIOLENCE) categories.append(ContentCategory.VIOLENCE)
# Use nsfw_severity if available, otherwise use None for default calculation
severity_override = (
image_result.nsfw_severity if image_result.nsfw_severity > 0 else None
)
# Include NSFW category in explanation for better logging
explanation = image_result.description
if image_result.nsfw_category and image_result.nsfw_category != "none":
explanation = f"[{image_result.nsfw_category}] {explanation}"
result = ModerationResult( result = ModerationResult(
is_flagged=True, is_flagged=True,
confidence=image_result.confidence, confidence=image_result.confidence,
categories=categories, categories=categories,
explanation=image_result.description, explanation=explanation,
suggested_action="delete", suggested_action=_get_action_for_nsfw(image_result.nsfw_category),
severity_override=severity_override,
) )
await self._handle_ai_result(message, result, "Image Analysis") await self._handle_ai_result(message, result, "Image Analysis")
return return

View File

@@ -38,23 +38,43 @@ Respond in this exact JSON format:
Be balanced - flag genuinely problematic content but allow normal conversation, jokes, and mild language. Consider context.""" Be balanced - flag genuinely problematic content but allow normal conversation, jokes, and mild language. Consider context."""
IMAGE_ANALYSIS_PROMPT = """Analyze this image for content moderation purposes. Check for: IMAGE_ANALYSIS_PROMPT = """Analyze this image for content moderation. Classify NSFW content with precision.
- NSFW content (nudity, sexual content)
- Violence or gore NSFW CATEGORIES (in order of severity):
- Disturbing or shocking content - none: Safe content, no sexual elements
- Any content inappropriate for a general audience - suggestive: Revealing clothing, provocative poses, cleavage, but no nudity
- partial_nudity: Underwear visible, partial exposure, censored nudity
- nudity: Full nudity visible (breasts, genitals, buttocks) but no sexual acts
- explicit: Sexual acts, pornographic content, genitalia in sexual context
DETECTION CRITERIA:
- Focus on human bodies; anime/artwork should be judged by same standards
- Swimwear in beach/pool context = suggestive at most
- Artistic nudity (classical art, sculptures) = nudity but note in description
- Medical/educational content = note context in description
ALSO CHECK FOR:
- Violence or gore (blood, injuries, weapons used violently)
- Disturbing content (shock imagery, extreme content)
Respond in this exact JSON format: Respond in this exact JSON format:
{ {
"is_nsfw": true/false, "is_nsfw": true/false,
"nsfw_category": "none|suggestive|partial_nudity|nudity|explicit",
"nsfw_severity": 0-100,
"is_violent": true/false, "is_violent": true/false,
"is_disturbing": true/false, "is_disturbing": true/false,
"confidence": 0.0-1.0, "confidence": 0.0-1.0,
"description": "Brief description of the image", "description": "Brief description including context",
"categories": ["category1", "category2"] "categories": ["category1", "category2"]
} }
Be accurate but not overly sensitive - artistic nudity or mild violence in appropriate contexts may be acceptable.""" NSFW SEVERITY GUIDELINES:
- none: 0
- suggestive: 20-35
- partial_nudity: 40-55
- nudity: 60-75
- explicit: 80-100"""
PHISHING_ANALYSIS_PROMPT = """Analyze this URL and message context for phishing or scam indicators. PHISHING_ANALYSIS_PROMPT = """Analyze this URL and message context for phishing or scam indicators.
@@ -97,6 +117,7 @@ class AnthropicProvider(AIProvider):
async def _call_api(self, system: str, user_content: Any, max_tokens: int = 500) -> str: async def _call_api(self, system: str, user_content: Any, max_tokens: int = 500) -> str:
"""Make an API call to Claude.""" """Make an API call to Claude."""
async def _request() -> str: async def _request() -> str:
message = await self.client.messages.create( message = await self.client.messages.create(
model=self.model, model=self.model,
@@ -180,11 +201,29 @@ class AnthropicProvider(AIProvider):
import aiohttp import aiohttp
sensitivity_note = ""
if sensitivity < 30: if sensitivity < 30:
sensitivity_note = "\n\nBe lenient - only flag explicit content." sensitivity_note = """
SENSITIVITY: LENIENT
- Allow suggestive content (swimwear, revealing clothing)
- Only flag partial_nudity and above as NSFW
- Consider artistic/educational context favorably
- Set is_nsfw=false for suggestive content"""
elif sensitivity > 70: elif sensitivity > 70:
sensitivity_note = "\n\nBe strict - flag suggestive content as well." sensitivity_note = """
SENSITIVITY: STRICT
- Flag suggestive content as NSFW (is_nsfw=true)
- No tolerance for any nudity regardless of context
- Provocative poses should be flagged
- Lower threshold for nsfw_severity scores"""
else:
sensitivity_note = """
SENSITIVITY: BALANCED
- Allow normal swimwear/fashion (is_nsfw=false for suggestive)
- Flag partial_nudity and above as NSFW
- Consider context for edge cases"""
system = IMAGE_ANALYSIS_PROMPT + sensitivity_note system = IMAGE_ANALYSIS_PROMPT + sensitivity_note
@@ -229,6 +268,8 @@ class AnthropicProvider(AIProvider):
confidence=float(data.get("confidence", 0.0)), confidence=float(data.get("confidence", 0.0)),
description=data.get("description", ""), description=data.get("description", ""),
categories=data.get("categories", []), categories=data.get("categories", []),
nsfw_category=data.get("nsfw_category", "none"),
nsfw_severity=int(data.get("nsfw_severity", 0)),
) )
except Exception as e: except Exception as e:

View File

@@ -23,6 +23,16 @@ class ContentCategory(str, Enum):
MISINFORMATION = "misinformation" MISINFORMATION = "misinformation"
class NSFWCategory(str, Enum):
    """NSFW content subcategories with increasing severity.

    The str mixin lets members compare equal to the raw category
    strings returned by the AI providers' JSON responses (e.g.
    ``NSFWCategory.NUDITY == "nudity"``).
    """

    NONE = "none"  # Safe content, no sexual elements
    SUGGESTIVE = "suggestive"  # Revealing clothing, provocative poses
    PARTIAL_NUDITY = "partial_nudity"  # Partial exposure, underwear
    NUDITY = "nudity"  # Full nudity without sexual acts
    EXPLICIT = "explicit"  # Sexual acts, pornographic content
_T = TypeVar("_T") _T = TypeVar("_T")
@@ -90,6 +100,7 @@ class ModerationResult:
categories: list[ContentCategory] = field(default_factory=list) categories: list[ContentCategory] = field(default_factory=list)
explanation: str = "" explanation: str = ""
suggested_action: Literal["none", "warn", "delete", "timeout", "ban"] = "none" suggested_action: Literal["none", "warn", "delete", "timeout", "ban"] = "none"
severity_override: int | None = None # Direct severity for NSFW images
@property @property
def severity(self) -> int: def severity(self) -> int:
@@ -97,6 +108,10 @@ class ModerationResult:
if not self.is_flagged: if not self.is_flagged:
return 0 return 0
# Use override if provided (e.g., from NSFW image analysis)
if self.severity_override is not None:
return min(self.severity_override, 100)
# Base severity from confidence # Base severity from confidence
severity = int(self.confidence * 50) severity = int(self.confidence * 50)
@@ -133,6 +148,8 @@ class ImageAnalysisResult:
confidence: float = 0.0 confidence: float = 0.0
description: str = "" description: str = ""
categories: list[str] = field(default_factory=list) categories: list[str] = field(default_factory=list)
nsfw_category: str = "none" # Maps to NSFWCategory values
nsfw_severity: int = 0 # 0-100 specific NSFW severity score
@dataclass @dataclass

View File

@@ -42,6 +42,7 @@ class OpenAIProvider(AIProvider):
max_tokens: int = 500, max_tokens: int = 500,
) -> str: ) -> str:
"""Make an API call to OpenAI.""" """Make an API call to OpenAI."""
async def _request() -> str: async def _request() -> str:
response = await self.client.chat.completions.create( response = await self.client.chat.completions.create(
model=self.model, model=self.model,
@@ -79,6 +80,7 @@ class OpenAIProvider(AIProvider):
"""Analyze text content for policy violations.""" """Analyze text content for policy violations."""
# First, use OpenAI's built-in moderation API for quick check # First, use OpenAI's built-in moderation API for quick check
try: try:
async def _moderate() -> Any: async def _moderate() -> Any:
return await self.client.moderations.create(input=content) return await self.client.moderations.create(input=content)
@@ -140,23 +142,47 @@ class OpenAIProvider(AIProvider):
sensitivity: int = 50, sensitivity: int = 50,
) -> ImageAnalysisResult: ) -> ImageAnalysisResult:
"""Analyze an image for NSFW or inappropriate content.""" """Analyze an image for NSFW or inappropriate content."""
system = """Analyze this image for content moderation. Respond in JSON format: system = """Analyze this image for content moderation. Classify NSFW content with precision.
NSFW CATEGORIES (in order of severity):
- none: Safe content, no sexual elements
- suggestive: Revealing clothing, provocative poses, cleavage, but no nudity
- partial_nudity: Underwear visible, partial exposure, censored nudity
- nudity: Full nudity visible (breasts, genitals, buttocks) but no sexual acts
- explicit: Sexual acts, pornographic content, genitalia in sexual context
DETECTION CRITERIA:
- Focus on human bodies; anime/artwork should be judged by same standards
- Swimwear in beach/pool context = suggestive at most
- Artistic nudity (classical art, sculptures) = nudity but note in description
ALSO CHECK FOR:
- Violence or gore (blood, injuries, weapons used violently)
- Disturbing content (shock imagery, extreme content)
Respond in JSON format:
{ {
"is_nsfw": true/false, "is_nsfw": true/false,
"nsfw_category": "none|suggestive|partial_nudity|nudity|explicit",
"nsfw_severity": 0-100,
"is_violent": true/false, "is_violent": true/false,
"is_disturbing": true/false, "is_disturbing": true/false,
"confidence": 0.0-1.0, "confidence": 0.0-1.0,
"description": "Brief description", "description": "Brief description including context",
"categories": ["category1"] "categories": ["category1"]
}""" }
NSFW SEVERITY GUIDELINES: none=0, suggestive=20-35, partial_nudity=40-55, nudity=60-75, explicit=80-100"""
sensitivity_note = ""
if sensitivity < 30: if sensitivity < 30:
sensitivity_note = " Be lenient - only flag explicit content." sensitivity_note = " SENSITIVITY: LENIENT - Allow suggestive content, only flag partial_nudity and above, set is_nsfw=false for suggestive."
elif sensitivity > 70: elif sensitivity > 70:
sensitivity_note = " Be strict - flag suggestive content." sensitivity_note = " SENSITIVITY: STRICT - Flag suggestive content as NSFW, no tolerance for any nudity regardless of context."
else:
sensitivity_note = " SENSITIVITY: BALANCED - Allow normal swimwear/fashion, flag partial_nudity and above."
try: try:
async def _request() -> Any: async def _request() -> Any:
return await self.client.chat.completions.create( return await self.client.chat.completions.create(
model="gpt-4o-mini", # Use vision-capable model model="gpt-4o-mini", # Use vision-capable model
@@ -189,6 +215,8 @@ class OpenAIProvider(AIProvider):
confidence=float(data.get("confidence", 0.0)), confidence=float(data.get("confidence", 0.0)),
description=data.get("description", ""), description=data.get("description", ""),
categories=data.get("categories", []), categories=data.get("categories", []),
nsfw_category=data.get("nsfw_category", "none"),
nsfw_severity=int(data.get("nsfw_severity", 0)),
) )
except Exception as e: except Exception as e: