From a5811113f00d478762878af7508c66797b50b23e Mon Sep 17 00:00:00 2001
From: latte <latte@hiddenden.cafe>
Date: Sat, 24 Jan 2026 17:37:09 +0100
Subject: [PATCH] improve NSFW image classification accuracy

---
 src/guardden/cogs/ai_moderation.py            | 47 ++++++++++++--
 .../services/ai/anthropic_provider.py         | 61 ++++++++++++++++---
 src/guardden/services/ai/base.py              | 17 ++++++
 src/guardden/services/ai/openai_provider.py   | 40 ++++++++++--
 4 files changed, 143 insertions(+), 22 deletions(-)

diff --git a/src/guardden/cogs/ai_moderation.py b/src/guardden/cogs/ai_moderation.py
index 64a50a6..0554edd 100644
--- a/src/guardden/cogs/ai_moderation.py
+++ b/src/guardden/cogs/ai_moderation.py
@@ -16,6 +16,17 @@ from guardden.utils.ratelimit import RateLimitExceeded
 logger = logging.getLogger(__name__)
 
 
+def _get_action_for_nsfw(category: str) -> str:
+    """Return the suggested action for an NSFW category ("delete" if unmapped)."""
+    mapping = {
+        "suggestive": "warn",
+        "partial_nudity": "delete",
+        "nudity": "delete",
+        "explicit": "timeout",
+    }
+    return mapping.get(category, "delete")  # "none"/unknown on a flagged image: keep old default
+
+
 class AIModeration(commands.Cog):
     """AI-powered content moderation."""
 
@@ -300,7 +311,8 @@ class AIModeration(commands.Cog):
                         sensitivity=config.ai_sensitivity,
                     )
                     logger.info(
-                        f"Image result: nsfw={image_result.is_nsfw}, violent={image_result.is_violent}, conf={image_result.confidence}"
+                        f"Image result: nsfw={image_result.is_nsfw}, category={image_result.nsfw_category}, "
+                        f"severity={image_result.nsfw_severity}, violent={image_result.is_violent}, conf={image_result.confidence}"
                     )
 
                     if (
@@ -315,12 +327,23 @@ class AIModeration(commands.Cog):
                         if image_result.is_violent:
                             categories.append(ContentCategory.VIOLENCE)
 
+                        # Use nsfw_severity if available, otherwise use None for default calculation
+                        severity_override = (
+                            image_result.nsfw_severity if image_result.nsfw_severity > 0 else None
+                        )
+
+                        # Include NSFW category in explanation for better logging
+                        explanation = image_result.description
+                        if image_result.nsfw_category and image_result.nsfw_category != "none":
+                            explanation = f"[{image_result.nsfw_category}] {explanation}"
+
                         result = ModerationResult(
                             is_flagged=True,
                             confidence=image_result.confidence,
                             categories=categories,
-                            explanation=image_result.description,
-                            suggested_action="delete",
+                            explanation=explanation,
+                            suggested_action=_get_action_for_nsfw(image_result.nsfw_category),
+                            severity_override=severity_override,
                         )
                         await self._handle_ai_result(message, result, "Image Analysis")
                         return
@@ -346,7 +369,8 @@ class AIModeration(commands.Cog):
                         sensitivity=config.ai_sensitivity,
                     )
                     logger.info(
-                        f"Embed image result: nsfw={image_result.is_nsfw}, violent={image_result.is_violent}, conf={image_result.confidence}"
+                        f"Embed image result: nsfw={image_result.is_nsfw}, category={image_result.nsfw_category}, "
+                        f"severity={image_result.nsfw_severity}, violent={image_result.is_violent}, conf={image_result.confidence}"
                     )
 
                     if (
@@ -360,12 +384,23 @@ class AIModeration(commands.Cog):
                         if image_result.is_violent:
                             categories.append(ContentCategory.VIOLENCE)
 
+                        # Use nsfw_severity if available, otherwise use None for default calculation
+                        severity_override = (
+                            image_result.nsfw_severity if image_result.nsfw_severity > 0 else None
+                        )
+
+                        # Include NSFW category in explanation for better logging
+                        explanation = image_result.description
+                        if image_result.nsfw_category and image_result.nsfw_category != "none":
+                            explanation = f"[{image_result.nsfw_category}] {explanation}"
+
                         result = ModerationResult(
                             is_flagged=True,
                             confidence=image_result.confidence,
                             categories=categories,
-                            explanation=image_result.description,
-                            suggested_action="delete",
+                            explanation=explanation,
+                            suggested_action=_get_action_for_nsfw(image_result.nsfw_category),
+                            severity_override=severity_override,
                         )
                         await self._handle_ai_result(message, result, "Image Analysis")
                         return
diff --git a/src/guardden/services/ai/anthropic_provider.py b/src/guardden/services/ai/anthropic_provider.py
index 4b100c5..e729bf2 100644
--- a/src/guardden/services/ai/anthropic_provider.py
+++ b/src/guardden/services/ai/anthropic_provider.py
@@ -38,23 +38,43 @@ Respond in this exact JSON format:
 
 Be balanced - flag genuinely problematic content but allow normal conversation, jokes, and mild language. Consider context."""
 
-IMAGE_ANALYSIS_PROMPT = """Analyze this image for content moderation purposes. Check for:
-- NSFW content (nudity, sexual content)
-- Violence or gore
-- Disturbing or shocking content
-- Any content inappropriate for a general audience
+IMAGE_ANALYSIS_PROMPT = """Analyze this image for content moderation. Classify NSFW content with precision.
+
+NSFW CATEGORIES (in order of severity):
+- none: Safe content, no sexual elements
+- suggestive: Revealing clothing, provocative poses, cleavage, but no nudity
+- partial_nudity: Underwear visible, partial exposure, censored nudity
+- nudity: Full nudity visible (breasts, genitals, buttocks) but no sexual acts
+- explicit: Sexual acts, pornographic content, genitalia in sexual context
+
+DETECTION CRITERIA:
+- Focus on human bodies; anime/artwork should be judged by same standards
+- Swimwear in beach/pool context = suggestive at most
+- Artistic nudity (classical art, sculptures) = nudity but note in description
+- Medical/educational content = note context in description
+
+ALSO CHECK FOR:
+- Violence or gore (blood, injuries, weapons used violently)
+- Disturbing content (shock imagery, extreme content)
 
 Respond in this exact JSON format:
 {
     "is_nsfw": true/false,
+    "nsfw_category": "none|suggestive|partial_nudity|nudity|explicit",
+    "nsfw_severity": 0-100,
     "is_violent": true/false,
     "is_disturbing": true/false,
     "confidence": 0.0-1.0,
-    "description": "Brief description of the image",
+    "description": "Brief description including context",
     "categories": ["category1", "category2"]
 }
 
-Be accurate but not overly sensitive - artistic nudity or mild violence in appropriate contexts may be acceptable."""
+NSFW SEVERITY GUIDELINES:
+- none: 0
+- suggestive: 20-35
+- partial_nudity: 40-55
+- nudity: 60-75
+- explicit: 80-100"""
 
 PHISHING_ANALYSIS_PROMPT = """Analyze this URL and message context for phishing or scam indicators.
 
@@ -97,6 +117,7 @@ class AnthropicProvider(AIProvider):
 
     async def _call_api(self, system: str, user_content: Any, max_tokens: int = 500) -> str:
         """Make an API call to Claude."""
+
         async def _request() -> str:
             message = await self.client.messages.create(
                 model=self.model,
@@ -180,11 +201,29 @@ class AnthropicProvider(AIProvider):
 
         import aiohttp
 
-        sensitivity_note = ""
         if sensitivity < 30:
-            sensitivity_note = "\n\nBe lenient - only flag explicit content."
+            sensitivity_note = """
+
+SENSITIVITY: LENIENT
+- Allow suggestive content (swimwear, revealing clothing)
+- Only flag partial_nudity and above as NSFW
+- Consider artistic/educational context favorably
+- Set is_nsfw=false for suggestive content"""
         elif sensitivity > 70:
-            sensitivity_note = "\n\nBe strict - flag suggestive content as well."
+            sensitivity_note = """
+
+SENSITIVITY: STRICT
+- Flag suggestive content as NSFW (is_nsfw=true)
+- No tolerance for any nudity regardless of context
+- Provocative poses should be flagged
+- Lower threshold for nsfw_severity scores"""
+        else:
+            sensitivity_note = """
+
+SENSITIVITY: BALANCED
+- Allow normal swimwear/fashion (is_nsfw=false for suggestive)
+- Flag partial_nudity and above as NSFW
+- Consider context for edge cases"""
 
         system = IMAGE_ANALYSIS_PROMPT + sensitivity_note
 
@@ -229,6 +268,8 @@ class AnthropicProvider(AIProvider):
                 confidence=float(data.get("confidence", 0.0)),
                 description=data.get("description", ""),
                 categories=data.get("categories", []),
+                nsfw_category=data.get("nsfw_category", "none"),
+                nsfw_severity=int(data.get("nsfw_severity", 0)),
             )
 
         except Exception as e:
diff --git a/src/guardden/services/ai/base.py b/src/guardden/services/ai/base.py
index 40e6752..29bd789 100644
--- a/src/guardden/services/ai/base.py
+++ b/src/guardden/services/ai/base.py
@@ -23,6 +23,16 @@ class ContentCategory(str, Enum):
     MISINFORMATION = "misinformation"
 
 
+class NSFWCategory(str, Enum):
+    """NSFW image subcategories, listed from least to most severe."""
+
+    NONE = "none"  # Safe content, no sexual elements
+    SUGGESTIVE = "suggestive"  # Revealing clothing, provocative poses; no nudity
+    PARTIAL_NUDITY = "partial_nudity"  # Underwear visible, partial exposure, censored nudity
+    NUDITY = "nudity"  # Full nudity without sexual acts
+    EXPLICIT = "explicit"  # Sexual acts, pornographic content
+
+
 _T = TypeVar("_T")
 
 
@@ -90,6 +100,7 @@ class ModerationResult:
     categories: list[ContentCategory] = field(default_factory=list)
     explanation: str = ""
     suggested_action: Literal["none", "warn", "delete", "timeout", "ban"] = "none"
+    severity_override: int | None = None  # Direct severity for NSFW images
 
     @property
     def severity(self) -> int:
@@ -97,6 +108,10 @@ class ModerationResult:
         if not self.is_flagged:
             return 0
 
+        # Use override if provided (e.g., from NSFW image analysis)
+        if self.severity_override is not None:
+            return min(self.severity_override, 100)
+
         # Base severity from confidence
         severity = int(self.confidence * 50)
 
@@ -133,6 +148,8 @@ class ImageAnalysisResult:
     confidence: float = 0.0
     description: str = ""
     categories: list[str] = field(default_factory=list)
+    nsfw_category: str = "none"  # Maps to NSFWCategory values
+    nsfw_severity: int = 0  # 0-100 specific NSFW severity score
 
 
 @dataclass
diff --git a/src/guardden/services/ai/openai_provider.py b/src/guardden/services/ai/openai_provider.py
index a82cbcc..103a2b1 100644
--- a/src/guardden/services/ai/openai_provider.py
+++ b/src/guardden/services/ai/openai_provider.py
@@ -42,6 +42,7 @@ class OpenAIProvider(AIProvider):
         max_tokens: int = 500,
     ) -> str:
         """Make an API call to OpenAI."""
+
         async def _request() -> str:
             response = await self.client.chat.completions.create(
                 model=self.model,
@@ -79,6 +80,7 @@ class OpenAIProvider(AIProvider):
         """Analyze text content for policy violations."""
         # First, use OpenAI's built-in moderation API for quick check
         try:
+
             async def _moderate() -> Any:
                 return await self.client.moderations.create(input=content)
 
@@ -140,23 +142,47 @@ class OpenAIProvider(AIProvider):
         sensitivity: int = 50,
     ) -> ImageAnalysisResult:
         """Analyze an image for NSFW or inappropriate content."""
-        system = """Analyze this image for content moderation. Respond in JSON format:
+        system = """Analyze this image for content moderation. Classify NSFW content with precision.
+
+NSFW CATEGORIES (in order of severity):
+- none: Safe content, no sexual elements
+- suggestive: Revealing clothing, provocative poses, cleavage, but no nudity
+- partial_nudity: Underwear visible, partial exposure, censored nudity
+- nudity: Full nudity visible (breasts, genitals, buttocks) but no sexual acts
+- explicit: Sexual acts, pornographic content, genitalia in sexual context
+
+DETECTION CRITERIA:
+- Focus on human bodies; anime/artwork should be judged by same standards
+- Swimwear in beach/pool context = suggestive at most
+- Artistic nudity (classical art, sculptures) = nudity but note in description
+
+ALSO CHECK FOR:
+- Violence or gore (blood, injuries, weapons used violently)
+- Disturbing content (shock imagery, extreme content)
+
+Respond in JSON format:
 {
     "is_nsfw": true/false,
+    "nsfw_category": "none|suggestive|partial_nudity|nudity|explicit",
+    "nsfw_severity": 0-100,
     "is_violent": true/false,
     "is_disturbing": true/false,
     "confidence": 0.0-1.0,
-    "description": "Brief description",
+    "description": "Brief description including context",
     "categories": ["category1"]
-}"""
+}
+
+NSFW SEVERITY GUIDELINES: none=0, suggestive=20-35, partial_nudity=40-55, nudity=60-75, explicit=80-100"""
 
-        sensitivity_note = ""
         if sensitivity < 30:
-            sensitivity_note = " Be lenient - only flag explicit content."
+            sensitivity_note = " SENSITIVITY: LENIENT - Allow suggestive content, only flag partial_nudity and above, set is_nsfw=false for suggestive."
         elif sensitivity > 70:
-            sensitivity_note = " Be strict - flag suggestive content."
+            sensitivity_note = " SENSITIVITY: STRICT - Flag suggestive content as NSFW, no tolerance for any nudity regardless of context."
+        else:
+            sensitivity_note = " SENSITIVITY: BALANCED - Allow normal swimwear/fashion, flag partial_nudity and above."
 
         try:
+
             async def _request() -> Any:
                 return await self.client.chat.completions.create(
                     model="gpt-4o-mini",  # Use vision-capable model
@@ -189,6 +215,8 @@ class OpenAIProvider(AIProvider):
                 confidence=float(data.get("confidence", 0.0)),
                 description=data.get("description", ""),
                 categories=data.get("categories", []),
+                nsfw_category=data.get("nsfw_category", "none"),
+                nsfw_severity=int(data.get("nsfw_severity", 0)),
             )
 
         except Exception as e: