From a5811113f00d478762878af7508c66797b50b23e Mon Sep 17 00:00:00 2001
From: latte
Date: Sat, 24 Jan 2026 17:37:09 +0100
Subject: [PATCH] improve accuracy

---
Review notes (everything between the three-dash line above and the first
file diff below is ignored when the patch is applied):

* Line structure was rebuilt from the hunk headers; every hunk matches its
  stated old/new line counts and the diffstat is unchanged. Leading
  whitespace of context lines and of the JSON examples inside the prompts
  is inferred, so apply with "git apply --ignore-whitespace" or regenerate
  the patch from the tree before relying on it byte-for-byte.
* _get_action_for_nsfw() now falls back to "delete". Before this change
  every flagged image got suggested_action="delete"; a fallback of "none"
  would silently drop the action for images that are flagged only as
  violent (their nsfw_category is "none").
* Both providers coerce a null/empty nsfw_category and nsfw_severity to
  "none" and 0, instead of letting int(None) raise into the broad
  "except Exception" handler that follows.
* ModerationResult.severity clamps severity_override to 0..100 (it was
  only capped at 100, so a negative override produced a negative severity).

 src/guardden/cogs/ai_moderation.py            | 47 ++++++++++++--
 .../services/ai/anthropic_provider.py         | 61 ++++++++++++++++---
 src/guardden/services/ai/base.py              | 17 ++++++
 src/guardden/services/ai/openai_provider.py   | 40 ++++++++++--
 4 files changed, 143 insertions(+), 22 deletions(-)

diff --git a/src/guardden/cogs/ai_moderation.py b/src/guardden/cogs/ai_moderation.py
index 64a50a6..0554edd 100644
--- a/src/guardden/cogs/ai_moderation.py
+++ b/src/guardden/cogs/ai_moderation.py
@@ -16,6 +16,17 @@ from guardden.utils.ratelimit import RateLimitExceeded
 logger = logging.getLogger(__name__)
 
 
+def _get_action_for_nsfw(category: str) -> str:
+    """Map an NSFW category to a suggested action ("delete" for unmapped categories)."""
+    mapping = {
+        "suggestive": "warn",
+        "partial_nudity": "delete",
+        "nudity": "delete",
+        "explicit": "timeout",
+    }
+    return mapping.get(category, "delete")
+
+
 class AIModeration(commands.Cog):
     """AI-powered content moderation."""
 
@@ -300,7 +311,8 @@ class AIModeration(commands.Cog):
                             sensitivity=config.ai_sensitivity,
                         )
                         logger.info(
-                            f"Image result: nsfw={image_result.is_nsfw}, violent={image_result.is_violent}, conf={image_result.confidence}"
+                            f"Image result: nsfw={image_result.is_nsfw}, category={image_result.nsfw_category}, "
+                            f"severity={image_result.nsfw_severity}, violent={image_result.is_violent}, conf={image_result.confidence}"
                         )
 
                         if (
@@ -315,12 +327,23 @@ class AIModeration(commands.Cog):
                             if image_result.is_violent:
                                 categories.append(ContentCategory.VIOLENCE)
 
+                            # Use nsfw_severity if available, otherwise use None for default calculation
+                            severity_override = (
+                                image_result.nsfw_severity if image_result.nsfw_severity > 0 else None
+                            )
+
+                            # Include NSFW category in explanation for better logging
+                            explanation = image_result.description
+                            if image_result.nsfw_category and image_result.nsfw_category != "none":
+                                explanation = f"[{image_result.nsfw_category}] {explanation}"
+
                             result = ModerationResult(
                                 is_flagged=True,
                                 confidence=image_result.confidence,
                                 categories=categories,
-                                explanation=image_result.description,
-                                suggested_action="delete",
+                                explanation=explanation,
+                                suggested_action=_get_action_for_nsfw(image_result.nsfw_category),
+                                severity_override=severity_override,
                             )
                             await self._handle_ai_result(message, result, "Image Analysis")
                             return
@@ -346,7 +369,8 @@ class AIModeration(commands.Cog):
                             sensitivity=config.ai_sensitivity,
                         )
                         logger.info(
-                            f"Embed image result: nsfw={image_result.is_nsfw}, violent={image_result.is_violent}, conf={image_result.confidence}"
+                            f"Embed image result: nsfw={image_result.is_nsfw}, category={image_result.nsfw_category}, "
+                            f"severity={image_result.nsfw_severity}, violent={image_result.is_violent}, conf={image_result.confidence}"
                         )
 
                         if (
@@ -360,12 +384,23 @@ class AIModeration(commands.Cog):
                             if image_result.is_violent:
                                 categories.append(ContentCategory.VIOLENCE)
 
+                            # Use nsfw_severity if available, otherwise use None for default calculation
+                            severity_override = (
+                                image_result.nsfw_severity if image_result.nsfw_severity > 0 else None
+                            )
+
+                            # Include NSFW category in explanation for better logging
+                            explanation = image_result.description
+                            if image_result.nsfw_category and image_result.nsfw_category != "none":
+                                explanation = f"[{image_result.nsfw_category}] {explanation}"
+
                             result = ModerationResult(
                                 is_flagged=True,
                                 confidence=image_result.confidence,
                                 categories=categories,
-                                explanation=image_result.description,
-                                suggested_action="delete",
+                                explanation=explanation,
+                                suggested_action=_get_action_for_nsfw(image_result.nsfw_category),
+                                severity_override=severity_override,
                             )
                             await self._handle_ai_result(message, result, "Image Analysis")
                             return
diff --git a/src/guardden/services/ai/anthropic_provider.py b/src/guardden/services/ai/anthropic_provider.py
index 4b100c5..e729bf2 100644
--- a/src/guardden/services/ai/anthropic_provider.py
+++ b/src/guardden/services/ai/anthropic_provider.py
@@ -38,23 +38,43 @@ Respond in this exact JSON format:
 
 Be balanced - flag genuinely problematic content but allow normal conversation, jokes, and mild language. Consider context."""
 
-IMAGE_ANALYSIS_PROMPT = """Analyze this image for content moderation purposes. Check for:
-- NSFW content (nudity, sexual content)
-- Violence or gore
-- Disturbing or shocking content
-- Any content inappropriate for a general audience
+IMAGE_ANALYSIS_PROMPT = """Analyze this image for content moderation. Classify NSFW content with precision.
+
+NSFW CATEGORIES (in order of severity):
+- none: Safe content, no sexual elements
+- suggestive: Revealing clothing, provocative poses, cleavage, but no nudity
+- partial_nudity: Underwear visible, partial exposure, censored nudity
+- nudity: Full nudity visible (breasts, genitals, buttocks) but no sexual acts
+- explicit: Sexual acts, pornographic content, genitalia in sexual context
+
+DETECTION CRITERIA:
+- Focus on human bodies; anime/artwork should be judged by same standards
+- Swimwear in beach/pool context = suggestive at most
+- Artistic nudity (classical art, sculptures) = nudity but note in description
+- Medical/educational content = note context in description
+
+ALSO CHECK FOR:
+- Violence or gore (blood, injuries, weapons used violently)
+- Disturbing content (shock imagery, extreme content)
 
 Respond in this exact JSON format:
 {
     "is_nsfw": true/false,
+    "nsfw_category": "none|suggestive|partial_nudity|nudity|explicit",
+    "nsfw_severity": 0-100,
     "is_violent": true/false,
     "is_disturbing": true/false,
     "confidence": 0.0-1.0,
-    "description": "Brief description of the image",
+    "description": "Brief description including context",
     "categories": ["category1", "category2"]
 }
 
-Be accurate but not overly sensitive - artistic nudity or mild violence in appropriate contexts may be acceptable."""
+NSFW SEVERITY GUIDELINES:
+- none: 0
+- suggestive: 20-35
+- partial_nudity: 40-55
+- nudity: 60-75
+- explicit: 80-100"""
 
 PHISHING_ANALYSIS_PROMPT = """Analyze this URL and message context for phishing or scam indicators.
 
@@ -97,6 +117,7 @@ class AnthropicProvider(AIProvider):
 
     async def _call_api(self, system: str, user_content: Any, max_tokens: int = 500) -> str:
         """Make an API call to Claude."""
+
         async def _request() -> str:
             message = await self.client.messages.create(
                 model=self.model,
@@ -180,11 +201,29 @@ class AnthropicProvider(AIProvider):
 
         import aiohttp
 
-        sensitivity_note = ""
         if sensitivity < 30:
-            sensitivity_note = "\n\nBe lenient - only flag explicit content."
+            sensitivity_note = """
+
+SENSITIVITY: LENIENT
+- Allow suggestive content (swimwear, revealing clothing)
+- Only flag partial_nudity and above as NSFW
+- Consider artistic/educational context favorably
+- Set is_nsfw=false for suggestive content"""
         elif sensitivity > 70:
-            sensitivity_note = "\n\nBe strict - flag suggestive content as well."
+            sensitivity_note = """
+
+SENSITIVITY: STRICT
+- Flag suggestive content as NSFW (is_nsfw=true)
+- No tolerance for any nudity regardless of context
+- Provocative poses should be flagged
+- Lower threshold for nsfw_severity scores"""
+        else:
+            sensitivity_note = """
+
+SENSITIVITY: BALANCED
+- Allow normal swimwear/fashion (is_nsfw=false for suggestive)
+- Flag partial_nudity and above as NSFW
+- Consider context for edge cases"""
 
         system = IMAGE_ANALYSIS_PROMPT + sensitivity_note
 
@@ -229,6 +268,8 @@ class AnthropicProvider(AIProvider):
                 confidence=float(data.get("confidence", 0.0)),
                 description=data.get("description", ""),
                 categories=data.get("categories", []),
+                nsfw_category=data.get("nsfw_category") or "none",
+                nsfw_severity=int(data.get("nsfw_severity") or 0),
             )
 
         except Exception as e:
diff --git a/src/guardden/services/ai/base.py b/src/guardden/services/ai/base.py
index 40e6752..29bd789 100644
--- a/src/guardden/services/ai/base.py
+++ b/src/guardden/services/ai/base.py
@@ -23,6 +23,16 @@ class ContentCategory(str, Enum):
     MISINFORMATION = "misinformation"
 
 
+class NSFWCategory(str, Enum):
+    """NSFW content subcategories with increasing severity."""
+
+    NONE = "none"
+    SUGGESTIVE = "suggestive"  # Revealing clothing, provocative poses
+    PARTIAL_NUDITY = "partial_nudity"  # Partial exposure, underwear
+    NUDITY = "nudity"  # Full nudity without sexual acts
+    EXPLICIT = "explicit"  # Sexual acts, pornographic content
+
+
 _T = TypeVar("_T")
 
 
@@ -90,6 +100,7 @@ class ModerationResult:
     categories: list[ContentCategory] = field(default_factory=list)
     explanation: str = ""
     suggested_action: Literal["none", "warn", "delete", "timeout", "ban"] = "none"
+    severity_override: int | None = None  # Direct severity for NSFW images
 
     @property
     def severity(self) -> int:
@@ -97,6 +108,10 @@ class ModerationResult:
         if not self.is_flagged:
             return 0
 
+        # Use override if provided (e.g., from NSFW image analysis), clamped to 0-100
+        if self.severity_override is not None:
+            return max(0, min(self.severity_override, 100))
+
         # Base severity from confidence
         severity = int(self.confidence * 50)
 
@@ -133,6 +148,8 @@ class ImageAnalysisResult:
     confidence: float = 0.0
     description: str = ""
     categories: list[str] = field(default_factory=list)
+    nsfw_category: str = "none"  # Maps to NSFWCategory values
+    nsfw_severity: int = 0  # 0-100 specific NSFW severity score
 
 
 @dataclass
diff --git a/src/guardden/services/ai/openai_provider.py b/src/guardden/services/ai/openai_provider.py
index a82cbcc..103a2b1 100644
--- a/src/guardden/services/ai/openai_provider.py
+++ b/src/guardden/services/ai/openai_provider.py
@@ -42,6 +42,7 @@ class OpenAIProvider(AIProvider):
         max_tokens: int = 500,
     ) -> str:
         """Make an API call to OpenAI."""
+
         async def _request() -> str:
             response = await self.client.chat.completions.create(
                 model=self.model,
@@ -79,6 +80,7 @@ class OpenAIProvider(AIProvider):
         """Analyze text content for policy violations."""
         # First, use OpenAI's built-in moderation API for quick check
         try:
+
             async def _moderate() -> Any:
                 return await self.client.moderations.create(input=content)
 
@@ -140,23 +142,47 @@ class OpenAIProvider(AIProvider):
         sensitivity: int = 50,
     ) -> ImageAnalysisResult:
         """Analyze an image for NSFW or inappropriate content."""
-        system = """Analyze this image for content moderation. Respond in JSON format:
+        system = """Analyze this image for content moderation. Classify NSFW content with precision.
+
+NSFW CATEGORIES (in order of severity):
+- none: Safe content, no sexual elements
+- suggestive: Revealing clothing, provocative poses, cleavage, but no nudity
+- partial_nudity: Underwear visible, partial exposure, censored nudity
+- nudity: Full nudity visible (breasts, genitals, buttocks) but no sexual acts
+- explicit: Sexual acts, pornographic content, genitalia in sexual context
+
+DETECTION CRITERIA:
+- Focus on human bodies; anime/artwork should be judged by same standards
+- Swimwear in beach/pool context = suggestive at most
+- Artistic nudity (classical art, sculptures) = nudity but note in description
+
+ALSO CHECK FOR:
+- Violence or gore (blood, injuries, weapons used violently)
+- Disturbing content (shock imagery, extreme content)
+
+Respond in JSON format:
 {
     "is_nsfw": true/false,
+    "nsfw_category": "none|suggestive|partial_nudity|nudity|explicit",
+    "nsfw_severity": 0-100,
     "is_violent": true/false,
     "is_disturbing": true/false,
     "confidence": 0.0-1.0,
-    "description": "Brief description",
+    "description": "Brief description including context",
     "categories": ["category1"]
-}"""
+}
+
+NSFW SEVERITY GUIDELINES: none=0, suggestive=20-35, partial_nudity=40-55, nudity=60-75, explicit=80-100"""
 
-        sensitivity_note = ""
         if sensitivity < 30:
-            sensitivity_note = " Be lenient - only flag explicit content."
+            sensitivity_note = " SENSITIVITY: LENIENT - Allow suggestive content, only flag partial_nudity and above, set is_nsfw=false for suggestive."
         elif sensitivity > 70:
-            sensitivity_note = " Be strict - flag suggestive content."
+            sensitivity_note = " SENSITIVITY: STRICT - Flag suggestive content as NSFW, no tolerance for any nudity regardless of context."
+        else:
+            sensitivity_note = " SENSITIVITY: BALANCED - Allow normal swimwear/fashion, flag partial_nudity and above."
 
         try:
+
             async def _request() -> Any:
                 return await self.client.chat.completions.create(
                     model="gpt-4o-mini",  # Use vision-capable model
@@ -189,6 +215,8 @@ class OpenAIProvider(AIProvider):
                 confidence=float(data.get("confidence", 0.0)),
                 description=data.get("description", ""),
                 categories=data.get("categories", []),
+                nsfw_category=data.get("nsfw_category") or "none",
+                nsfw_severity=int(data.get("nsfw_severity") or 0),
             )
 
         except Exception as e: