Improve NSFW image-moderation accuracy: add graded NSFW categories, severity scores, and per-category actions
Some checks failed
CI/CD Pipeline / Code Quality Checks (push) Failing after 4m50s
CI/CD Pipeline / Security Scanning (push) Successful in 16s
CI/CD Pipeline / Tests (3.11) (push) Successful in 9m44s
CI/CD Pipeline / Tests (3.12) (push) Successful in 9m37s
CI/CD Pipeline / Build Docker Image (push) Has been skipped

This commit is contained in:
2026-01-24 17:37:09 +01:00
parent 136ae04388
commit a5811113f0
4 changed files with 143 additions and 22 deletions

View File

@@ -16,6 +16,17 @@ from guardden.utils.ratelimit import RateLimitExceeded
logger = logging.getLogger(__name__) logger = logging.getLogger(__name__)
def _get_action_for_nsfw(category: str) -> str:
"""Map NSFW category to suggested action."""
mapping = {
"suggestive": "warn",
"partial_nudity": "delete",
"nudity": "delete",
"explicit": "timeout",
}
return mapping.get(category, "none")
class AIModeration(commands.Cog): class AIModeration(commands.Cog):
"""AI-powered content moderation.""" """AI-powered content moderation."""
@@ -300,7 +311,8 @@ class AIModeration(commands.Cog):
sensitivity=config.ai_sensitivity, sensitivity=config.ai_sensitivity,
) )
logger.info( logger.info(
f"Image result: nsfw={image_result.is_nsfw}, violent={image_result.is_violent}, conf={image_result.confidence}" f"Image result: nsfw={image_result.is_nsfw}, category={image_result.nsfw_category}, "
f"severity={image_result.nsfw_severity}, violent={image_result.is_violent}, conf={image_result.confidence}"
) )
if ( if (
@@ -315,12 +327,23 @@ class AIModeration(commands.Cog):
if image_result.is_violent: if image_result.is_violent:
categories.append(ContentCategory.VIOLENCE) categories.append(ContentCategory.VIOLENCE)
# Use nsfw_severity if available, otherwise use None for default calculation
severity_override = (
image_result.nsfw_severity if image_result.nsfw_severity > 0 else None
)
# Include NSFW category in explanation for better logging
explanation = image_result.description
if image_result.nsfw_category and image_result.nsfw_category != "none":
explanation = f"[{image_result.nsfw_category}] {explanation}"
result = ModerationResult( result = ModerationResult(
is_flagged=True, is_flagged=True,
confidence=image_result.confidence, confidence=image_result.confidence,
categories=categories, categories=categories,
explanation=image_result.description, explanation=explanation,
suggested_action="delete", suggested_action=_get_action_for_nsfw(image_result.nsfw_category),
severity_override=severity_override,
) )
await self._handle_ai_result(message, result, "Image Analysis") await self._handle_ai_result(message, result, "Image Analysis")
return return
@@ -346,7 +369,8 @@ class AIModeration(commands.Cog):
sensitivity=config.ai_sensitivity, sensitivity=config.ai_sensitivity,
) )
logger.info( logger.info(
f"Embed image result: nsfw={image_result.is_nsfw}, violent={image_result.is_violent}, conf={image_result.confidence}" f"Embed image result: nsfw={image_result.is_nsfw}, category={image_result.nsfw_category}, "
f"severity={image_result.nsfw_severity}, violent={image_result.is_violent}, conf={image_result.confidence}"
) )
if ( if (
@@ -360,12 +384,23 @@ class AIModeration(commands.Cog):
if image_result.is_violent: if image_result.is_violent:
categories.append(ContentCategory.VIOLENCE) categories.append(ContentCategory.VIOLENCE)
# Use nsfw_severity if available, otherwise use None for default calculation
severity_override = (
image_result.nsfw_severity if image_result.nsfw_severity > 0 else None
)
# Include NSFW category in explanation for better logging
explanation = image_result.description
if image_result.nsfw_category and image_result.nsfw_category != "none":
explanation = f"[{image_result.nsfw_category}] {explanation}"
result = ModerationResult( result = ModerationResult(
is_flagged=True, is_flagged=True,
confidence=image_result.confidence, confidence=image_result.confidence,
categories=categories, categories=categories,
explanation=image_result.description, explanation=explanation,
suggested_action="delete", suggested_action=_get_action_for_nsfw(image_result.nsfw_category),
severity_override=severity_override,
) )
await self._handle_ai_result(message, result, "Image Analysis") await self._handle_ai_result(message, result, "Image Analysis")
return return

View File

@@ -38,23 +38,43 @@ Respond in this exact JSON format:
Be balanced - flag genuinely problematic content but allow normal conversation, jokes, and mild language. Consider context.""" Be balanced - flag genuinely problematic content but allow normal conversation, jokes, and mild language. Consider context."""
IMAGE_ANALYSIS_PROMPT = """Analyze this image for content moderation purposes. Check for: IMAGE_ANALYSIS_PROMPT = """Analyze this image for content moderation. Classify NSFW content with precision.
- NSFW content (nudity, sexual content)
- Violence or gore NSFW CATEGORIES (in order of severity):
- Disturbing or shocking content - none: Safe content, no sexual elements
- Any content inappropriate for a general audience - suggestive: Revealing clothing, provocative poses, cleavage, but no nudity
- partial_nudity: Underwear visible, partial exposure, censored nudity
- nudity: Full nudity visible (breasts, genitals, buttocks) but no sexual acts
- explicit: Sexual acts, pornographic content, genitalia in sexual context
DETECTION CRITERIA:
- Focus on human bodies; anime/artwork should be judged by same standards
- Swimwear in beach/pool context = suggestive at most
- Artistic nudity (classical art, sculptures) = nudity but note in description
- Medical/educational content = note context in description
ALSO CHECK FOR:
- Violence or gore (blood, injuries, weapons used violently)
- Disturbing content (shock imagery, extreme content)
Respond in this exact JSON format: Respond in this exact JSON format:
{ {
"is_nsfw": true/false, "is_nsfw": true/false,
"nsfw_category": "none|suggestive|partial_nudity|nudity|explicit",
"nsfw_severity": 0-100,
"is_violent": true/false, "is_violent": true/false,
"is_disturbing": true/false, "is_disturbing": true/false,
"confidence": 0.0-1.0, "confidence": 0.0-1.0,
"description": "Brief description of the image", "description": "Brief description including context",
"categories": ["category1", "category2"] "categories": ["category1", "category2"]
} }
Be accurate but not overly sensitive - artistic nudity or mild violence in appropriate contexts may be acceptable.""" NSFW SEVERITY GUIDELINES:
- none: 0
- suggestive: 20-35
- partial_nudity: 40-55
- nudity: 60-75
- explicit: 80-100"""
PHISHING_ANALYSIS_PROMPT = """Analyze this URL and message context for phishing or scam indicators. PHISHING_ANALYSIS_PROMPT = """Analyze this URL and message context for phishing or scam indicators.
@@ -97,6 +117,7 @@ class AnthropicProvider(AIProvider):
async def _call_api(self, system: str, user_content: Any, max_tokens: int = 500) -> str: async def _call_api(self, system: str, user_content: Any, max_tokens: int = 500) -> str:
"""Make an API call to Claude.""" """Make an API call to Claude."""
async def _request() -> str: async def _request() -> str:
message = await self.client.messages.create( message = await self.client.messages.create(
model=self.model, model=self.model,
@@ -180,11 +201,29 @@ class AnthropicProvider(AIProvider):
import aiohttp import aiohttp
sensitivity_note = ""
if sensitivity < 30: if sensitivity < 30:
sensitivity_note = "\n\nBe lenient - only flag explicit content." sensitivity_note = """
SENSITIVITY: LENIENT
- Allow suggestive content (swimwear, revealing clothing)
- Only flag partial_nudity and above as NSFW
- Consider artistic/educational context favorably
- Set is_nsfw=false for suggestive content"""
elif sensitivity > 70: elif sensitivity > 70:
sensitivity_note = "\n\nBe strict - flag suggestive content as well." sensitivity_note = """
SENSITIVITY: STRICT
- Flag suggestive content as NSFW (is_nsfw=true)
- No tolerance for any nudity regardless of context
- Provocative poses should be flagged
- Lower threshold for nsfw_severity scores"""
else:
sensitivity_note = """
SENSITIVITY: BALANCED
- Allow normal swimwear/fashion (is_nsfw=false for suggestive)
- Flag partial_nudity and above as NSFW
- Consider context for edge cases"""
system = IMAGE_ANALYSIS_PROMPT + sensitivity_note system = IMAGE_ANALYSIS_PROMPT + sensitivity_note
@@ -229,6 +268,8 @@ class AnthropicProvider(AIProvider):
confidence=float(data.get("confidence", 0.0)), confidence=float(data.get("confidence", 0.0)),
description=data.get("description", ""), description=data.get("description", ""),
categories=data.get("categories", []), categories=data.get("categories", []),
nsfw_category=data.get("nsfw_category", "none"),
nsfw_severity=int(data.get("nsfw_severity", 0)),
) )
except Exception as e: except Exception as e:

View File

@@ -23,6 +23,16 @@ class ContentCategory(str, Enum):
MISINFORMATION = "misinformation" MISINFORMATION = "misinformation"
class NSFWCategory(str, Enum):
    """NSFW content subcategories with increasing severity.

    The str mixin lets members compare equal to the raw category
    strings returned by the AI providers' JSON responses (e.g.
    ``NSFWCategory.NUDITY == "nudity"``).
    """

    NONE = "none"  # Safe content, no sexual elements
    SUGGESTIVE = "suggestive"  # Revealing clothing, provocative poses
    PARTIAL_NUDITY = "partial_nudity"  # Partial exposure, underwear
    NUDITY = "nudity"  # Full nudity without sexual acts
    EXPLICIT = "explicit"  # Sexual acts, pornographic content
_T = TypeVar("_T") _T = TypeVar("_T")
@@ -90,6 +100,7 @@ class ModerationResult:
categories: list[ContentCategory] = field(default_factory=list) categories: list[ContentCategory] = field(default_factory=list)
explanation: str = "" explanation: str = ""
suggested_action: Literal["none", "warn", "delete", "timeout", "ban"] = "none" suggested_action: Literal["none", "warn", "delete", "timeout", "ban"] = "none"
severity_override: int | None = None # Direct severity for NSFW images
@property @property
def severity(self) -> int: def severity(self) -> int:
@@ -97,6 +108,10 @@ class ModerationResult:
if not self.is_flagged: if not self.is_flagged:
return 0 return 0
# Use override if provided (e.g., from NSFW image analysis)
if self.severity_override is not None:
return min(self.severity_override, 100)
# Base severity from confidence # Base severity from confidence
severity = int(self.confidence * 50) severity = int(self.confidence * 50)
@@ -133,6 +148,8 @@ class ImageAnalysisResult:
confidence: float = 0.0 confidence: float = 0.0
description: str = "" description: str = ""
categories: list[str] = field(default_factory=list) categories: list[str] = field(default_factory=list)
nsfw_category: str = "none" # Maps to NSFWCategory values
nsfw_severity: int = 0 # 0-100 specific NSFW severity score
@dataclass @dataclass

View File

@@ -42,6 +42,7 @@ class OpenAIProvider(AIProvider):
max_tokens: int = 500, max_tokens: int = 500,
) -> str: ) -> str:
"""Make an API call to OpenAI.""" """Make an API call to OpenAI."""
async def _request() -> str: async def _request() -> str:
response = await self.client.chat.completions.create( response = await self.client.chat.completions.create(
model=self.model, model=self.model,
@@ -79,6 +80,7 @@ class OpenAIProvider(AIProvider):
"""Analyze text content for policy violations.""" """Analyze text content for policy violations."""
# First, use OpenAI's built-in moderation API for quick check # First, use OpenAI's built-in moderation API for quick check
try: try:
async def _moderate() -> Any: async def _moderate() -> Any:
return await self.client.moderations.create(input=content) return await self.client.moderations.create(input=content)
@@ -140,23 +142,47 @@ class OpenAIProvider(AIProvider):
sensitivity: int = 50, sensitivity: int = 50,
) -> ImageAnalysisResult: ) -> ImageAnalysisResult:
"""Analyze an image for NSFW or inappropriate content.""" """Analyze an image for NSFW or inappropriate content."""
system = """Analyze this image for content moderation. Respond in JSON format: system = """Analyze this image for content moderation. Classify NSFW content with precision.
NSFW CATEGORIES (in order of severity):
- none: Safe content, no sexual elements
- suggestive: Revealing clothing, provocative poses, cleavage, but no nudity
- partial_nudity: Underwear visible, partial exposure, censored nudity
- nudity: Full nudity visible (breasts, genitals, buttocks) but no sexual acts
- explicit: Sexual acts, pornographic content, genitalia in sexual context
DETECTION CRITERIA:
- Focus on human bodies; anime/artwork should be judged by same standards
- Swimwear in beach/pool context = suggestive at most
- Artistic nudity (classical art, sculptures) = nudity but note in description
ALSO CHECK FOR:
- Violence or gore (blood, injuries, weapons used violently)
- Disturbing content (shock imagery, extreme content)
Respond in JSON format:
{ {
"is_nsfw": true/false, "is_nsfw": true/false,
"nsfw_category": "none|suggestive|partial_nudity|nudity|explicit",
"nsfw_severity": 0-100,
"is_violent": true/false, "is_violent": true/false,
"is_disturbing": true/false, "is_disturbing": true/false,
"confidence": 0.0-1.0, "confidence": 0.0-1.0,
"description": "Brief description", "description": "Brief description including context",
"categories": ["category1"] "categories": ["category1"]
}""" }
NSFW SEVERITY GUIDELINES: none=0, suggestive=20-35, partial_nudity=40-55, nudity=60-75, explicit=80-100"""
sensitivity_note = ""
if sensitivity < 30: if sensitivity < 30:
sensitivity_note = " Be lenient - only flag explicit content." sensitivity_note = " SENSITIVITY: LENIENT - Allow suggestive content, only flag partial_nudity and above, set is_nsfw=false for suggestive."
elif sensitivity > 70: elif sensitivity > 70:
sensitivity_note = " Be strict - flag suggestive content." sensitivity_note = " SENSITIVITY: STRICT - Flag suggestive content as NSFW, no tolerance for any nudity regardless of context."
else:
sensitivity_note = " SENSITIVITY: BALANCED - Allow normal swimwear/fashion, flag partial_nudity and above."
try: try:
async def _request() -> Any: async def _request() -> Any:
return await self.client.chat.completions.create( return await self.client.chat.completions.create(
model="gpt-4o-mini", # Use vision-capable model model="gpt-4o-mini", # Use vision-capable model
@@ -189,6 +215,8 @@ class OpenAIProvider(AIProvider):
confidence=float(data.get("confidence", 0.0)), confidence=float(data.get("confidence", 0.0)),
description=data.get("description", ""), description=data.get("description", ""),
categories=data.get("categories", []), categories=data.get("categories", []),
nsfw_category=data.get("nsfw_category", "none"),
nsfw_severity=int(data.get("nsfw_severity", 0)),
) )
except Exception as e: except Exception as e: