improve accuracy

2026-01-24 17:37:09 +01:00
parent 136ae04388
commit a5811113f0
4 changed files with 143 additions and 22 deletions
--- a/src/guardden/cogs/ai_moderation.py
+++ b/src/guardden/cogs/ai_moderation.py
@@ -16,6 +16,17 @@ from guardden.utils.ratelimit import RateLimitExceeded
 logger = logging.getLogger(__name__)


+def _get_action_for_nsfw(category: str) -> str:
+    """Map an NSFW category to the suggested action for an already-flagged image.
+
+    Only "suggestive" (warn) and "explicit" (timeout) deviate from "delete"; "none",
+    missing, or unrecognised categories (e.g. violence-only hits) keep "delete".
+    """
+    actions = {"suggestive": "warn", "explicit": "timeout"}
+    key = category.strip().lower() if isinstance(category, str) else ""
+    return actions.get(key, "delete")
+
+
 class AIModeration(commands.Cog):
    """AI-powered content moderation."""

@@ -300,7 +311,8 @@ class AIModeration(commands.Cog):
                        sensitivity=config.ai_sensitivity,
                    )
                    logger.info(
-                        f"Image result: nsfw={image_result.is_nsfw}, violent={image_result.is_violent}, conf={image_result.confidence}"
+                        f"Image result: nsfw={image_result.is_nsfw}, category={image_result.nsfw_category}, "
+                        f"severity={image_result.nsfw_severity}, violent={image_result.is_violent}, conf={image_result.confidence}"
                    )

                    if (
@@ -315,12 +327,23 @@ class AIModeration(commands.Cog):
                        if image_result.is_violent:
                            categories.append(ContentCategory.VIOLENCE)

+                        # Use nsfw_severity if available, otherwise use None for default calculation
+                        severity_override = (
+                            image_result.nsfw_severity if image_result.nsfw_severity > 0 else None
+                        )
+
+                        # Include NSFW category in explanation for better logging
+                        explanation = image_result.description
+                        if image_result.nsfw_category and image_result.nsfw_category != "none":
+                            explanation = f"[{image_result.nsfw_category}] {explanation}"
+
                        result = ModerationResult(
                            is_flagged=True,
                            confidence=image_result.confidence,
                            categories=categories,
-                            explanation=image_result.description,
-                            suggested_action="delete",
+                            explanation=explanation,
+                            suggested_action=_get_action_for_nsfw(image_result.nsfw_category),
+                            severity_override=severity_override,
                        )
                        await self._handle_ai_result(message, result, "Image Analysis")
                        return
@@ -346,7 +369,8 @@ class AIModeration(commands.Cog):
                        sensitivity=config.ai_sensitivity,
                    )
                    logger.info(
-                        f"Embed image result: nsfw={image_result.is_nsfw}, violent={image_result.is_violent}, conf={image_result.confidence}"
+                        f"Embed image result: nsfw={image_result.is_nsfw}, category={image_result.nsfw_category}, "
+                        f"severity={image_result.nsfw_severity}, violent={image_result.is_violent}, conf={image_result.confidence}"
                    )

                    if (
@@ -360,12 +384,23 @@ class AIModeration(commands.Cog):
                        if image_result.is_violent:
                            categories.append(ContentCategory.VIOLENCE)

+                        # Use nsfw_severity if available, otherwise use None for default calculation
+                        severity_override = (
+                            image_result.nsfw_severity if image_result.nsfw_severity > 0 else None
+                        )
+
+                        # Include NSFW category in explanation for better logging
+                        explanation = image_result.description
+                        if image_result.nsfw_category and image_result.nsfw_category != "none":
+                            explanation = f"[{image_result.nsfw_category}] {explanation}"
+
                        result = ModerationResult(
                            is_flagged=True,
                            confidence=image_result.confidence,
                            categories=categories,
-                            explanation=image_result.description,
-                            suggested_action="delete",
+                            explanation=explanation,
+                            suggested_action=_get_action_for_nsfw(image_result.nsfw_category),
+                            severity_override=severity_override,
                        )
                        await self._handle_ai_result(message, result, "Image Analysis")
                        return
--- a/src/guardden/services/ai/anthropic_provider.py
+++ b/src/guardden/services/ai/anthropic_provider.py
@@ -38,23 +38,43 @@ Respond in this exact JSON format:

 Be balanced - flag genuinely problematic content but allow normal conversation, jokes, and mild language. Consider context."""

-IMAGE_ANALYSIS_PROMPT = """Analyze this image for content moderation purposes. Check for:
- NSFW content (nudity, sexual content)
- Violence or gore
- Disturbing or shocking content
- Any content inappropriate for a general audience
+IMAGE_ANALYSIS_PROMPT = """Analyze this image for content moderation. Classify NSFW content with precision.
+
+NSFW CATEGORIES (in order of severity):
+- none: Safe content, no sexual elements
+- suggestive: Revealing clothing, provocative poses, cleavage, but no nudity
+- partial_nudity: Underwear visible, partial exposure, censored nudity
+- nudity: Full nudity visible (breasts, genitals, buttocks) but no sexual acts
+- explicit: Sexual acts, pornographic content, genitalia in sexual context
+
+DETECTION CRITERIA:
+- Focus on human bodies; anime/artwork should be judged by same standards
+- Swimwear in beach/pool context = suggestive at most
+- Artistic nudity (classical art, sculptures) = nudity but note in description
+- Medical/educational content = note context in description
+
+ALSO CHECK FOR:
+- Violence or gore (blood, injuries, weapons used violently)
+- Disturbing content (shock imagery, extreme content)

 Respond in this exact JSON format:
 {
    "is_nsfw": true/false,
+    "nsfw_category": "none|suggestive|partial_nudity|nudity|explicit",
+    "nsfw_severity": 0-100,
    "is_violent": true/false,
    "is_disturbing": true/false,
    "confidence": 0.0-1.0,
-    "description": "Brief description of the image",
+    "description": "Brief description including context",
    "categories": ["category1", "category2"]
 }

-Be accurate but not overly sensitive - artistic nudity or mild violence in appropriate contexts may be acceptable."""
+NSFW SEVERITY GUIDELINES:
+- none: 0
+- suggestive: 20-35
+- partial_nudity: 40-55
+- nudity: 60-75
+- explicit: 80-100"""

 PHISHING_ANALYSIS_PROMPT = """Analyze this URL and message context for phishing or scam indicators.

@@ -97,6 +117,7 @@ class AnthropicProvider(AIProvider):

    async def _call_api(self, system: str, user_content: Any, max_tokens: int = 500) -> str:
        """Make an API call to Claude."""
+
        async def _request() -> str:
            message = await self.client.messages.create(
                model=self.model,
@@ -180,11 +201,29 @@ class AnthropicProvider(AIProvider):

        import aiohttp

-        sensitivity_note = ""
        if sensitivity < 30:
-            sensitivity_note = "\n\nBe lenient - only flag explicit content."
+            sensitivity_note = """
+
+SENSITIVITY: LENIENT
+- Allow suggestive content (swimwear, revealing clothing)
+- Only flag partial_nudity and above as NSFW
+- Consider artistic/educational context favorably
+- Set is_nsfw=false for suggestive content"""
        elif sensitivity > 70:
-            sensitivity_note = "\n\nBe strict - flag suggestive content as well."
+            sensitivity_note = """
+
+SENSITIVITY: STRICT
+- Flag suggestive content as NSFW (is_nsfw=true)
+- No tolerance for any nudity regardless of context
+- Provocative poses should be flagged
+- Lower threshold for nsfw_severity scores"""
+        else:
+            sensitivity_note = """
+
+SENSITIVITY: BALANCED
+- Allow normal swimwear/fashion (is_nsfw=false for suggestive)
+- Flag partial_nudity and above as NSFW
+- Consider context for edge cases"""

        system = IMAGE_ANALYSIS_PROMPT + sensitivity_note

@@ -229,6 +268,8 @@ class AnthropicProvider(AIProvider):
                confidence=float(data.get("confidence", 0.0)),
                description=data.get("description", ""),
                categories=data.get("categories", []),
+                nsfw_category=data.get("nsfw_category", "none"),
+                nsfw_severity=int(data.get("nsfw_severity", 0)),
            )

        except Exception as e:
--- a/src/guardden/services/ai/base.py
+++ b/src/guardden/services/ai/base.py
@@ -23,6 +23,16 @@ class ContentCategory(str, Enum):
    MISINFORMATION = "misinformation"


+class NSFWCategory(str, Enum):
+    """NSFW content subcategories, declared from least to most severe."""
+
+    NONE = "none"  # Safe content, no sexual elements
+    SUGGESTIVE = "suggestive"  # Revealing clothing, provocative poses, but no nudity
+    PARTIAL_NUDITY = "partial_nudity"  # Partial exposure, visible underwear, censored nudity
+    NUDITY = "nudity"  # Full nudity without sexual acts
+    EXPLICIT = "explicit"  # Sexual acts, pornographic content
+
+
 _T = TypeVar("_T")


@@ -90,6 +100,7 @@ class ModerationResult:
    categories: list[ContentCategory] = field(default_factory=list)
    explanation: str = ""
    suggested_action: Literal["none", "warn", "delete", "timeout", "ban"] = "none"
+    severity_override: int | None = None  # Direct severity for NSFW images

    @property
    def severity(self) -> int:
@@ -97,6 +108,10 @@ class ModerationResult:
        if not self.is_flagged:
            return 0

+        # Use override if provided (e.g., from NSFW image analysis)
+        if self.severity_override is not None:
+            return min(self.severity_override, 100)
+
        # Base severity from confidence
        severity = int(self.confidence * 50)

@@ -133,6 +148,8 @@ class ImageAnalysisResult:
    confidence: float = 0.0
    description: str = ""
    categories: list[str] = field(default_factory=list)
+    nsfw_category: str = "none"  # Maps to NSFWCategory values
+    nsfw_severity: int = 0  # 0-100 specific NSFW severity score


@dataclass
--- a/src/guardden/services/ai/openai_provider.py
+++ b/src/guardden/services/ai/openai_provider.py
@@ -42,6 +42,7 @@ class OpenAIProvider(AIProvider):
        max_tokens: int = 500,
    ) -> str:
        """Make an API call to OpenAI."""
+
        async def _request() -> str:
            response = await self.client.chat.completions.create(
                model=self.model,
@@ -79,6 +80,7 @@ class OpenAIProvider(AIProvider):
        """Analyze text content for policy violations."""
        # First, use OpenAI's built-in moderation API for quick check
        try:
+
            async def _moderate() -> Any:
                return await self.client.moderations.create(input=content)

@@ -140,23 +142,47 @@ class OpenAIProvider(AIProvider):
        sensitivity: int = 50,
    ) -> ImageAnalysisResult:
        """Analyze an image for NSFW or inappropriate content."""
-        system = """Analyze this image for content moderation. Respond in JSON format:
+        system = """Analyze this image for content moderation. Classify NSFW content with precision.
+
+NSFW CATEGORIES (in order of severity):
+- none: Safe content, no sexual elements
+- suggestive: Revealing clothing, provocative poses, cleavage, but no nudity
+- partial_nudity: Underwear visible, partial exposure, censored nudity
+- nudity: Full nudity visible (breasts, genitals, buttocks) but no sexual acts
+- explicit: Sexual acts, pornographic content, genitalia in sexual context
+
+DETECTION CRITERIA:
+- Focus on human bodies; anime/artwork should be judged by same standards
+- Swimwear in beach/pool context = suggestive at most
+- Artistic nudity (classical art, sculptures) = nudity but note in description
+
+ALSO CHECK FOR:
+- Violence or gore (blood, injuries, weapons used violently)
+- Disturbing content (shock imagery, extreme content)
+
+Respond in JSON format:
 {
    "is_nsfw": true/false,
+    "nsfw_category": "none|suggestive|partial_nudity|nudity|explicit",
+    "nsfw_severity": 0-100,
    "is_violent": true/false,
    "is_disturbing": true/false,
    "confidence": 0.0-1.0,
-    "description": "Brief description",
+    "description": "Brief description including context",
    "categories": ["category1"]
-}"""
+}
+
+NSFW SEVERITY GUIDELINES: none=0, suggestive=20-35, partial_nudity=40-55, nudity=60-75, explicit=80-100"""

-        sensitivity_note = ""
        if sensitivity < 30:
-            sensitivity_note = " Be lenient - only flag explicit content."
+            sensitivity_note = " SENSITIVITY: LENIENT - Allow suggestive content, only flag partial_nudity and above, set is_nsfw=false for suggestive."
        elif sensitivity > 70:
-            sensitivity_note = " Be strict - flag suggestive content."
+            sensitivity_note = " SENSITIVITY: STRICT - Flag suggestive content as NSFW, no tolerance for any nudity regardless of context."
+        else:
+            sensitivity_note = " SENSITIVITY: BALANCED - Allow normal swimwear/fashion, flag partial_nudity and above."

        try:
+
            async def _request() -> Any:
                return await self.client.chat.completions.create(
                    model="gpt-4o-mini",  # Use vision-capable model
@@ -189,6 +215,8 @@ class OpenAIProvider(AIProvider):
                confidence=float(data.get("confidence", 0.0)),
                description=data.get("description", ""),
                categories=data.get("categories", []),
+                nsfw_category=data.get("nsfw_category", "none"),
+                nsfw_severity=int(data.get("nsfw_severity", 0)),
            )

        except Exception as e: