improve accuracy
Some checks failed
CI/CD Pipeline / Code Quality Checks (push) Failing after 4m50s
CI/CD Pipeline / Security Scanning (push) Successful in 16s
CI/CD Pipeline / Tests (3.11) (push) Successful in 9m44s
CI/CD Pipeline / Tests (3.12) (push) Successful in 9m37s
CI/CD Pipeline / Build Docker Image (push) Has been skipped
Some checks failed
CI/CD Pipeline / Code Quality Checks (push) Failing after 4m50s
CI/CD Pipeline / Security Scanning (push) Successful in 16s
CI/CD Pipeline / Tests (3.11) (push) Successful in 9m44s
CI/CD Pipeline / Tests (3.12) (push) Successful in 9m37s
CI/CD Pipeline / Build Docker Image (push) Has been skipped
This commit is contained in:
@@ -16,6 +16,17 @@ from guardden.utils.ratelimit import RateLimitExceeded
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
def _get_action_for_nsfw(category: str) -> str:
|
||||
"""Map NSFW category to suggested action."""
|
||||
mapping = {
|
||||
"suggestive": "warn",
|
||||
"partial_nudity": "delete",
|
||||
"nudity": "delete",
|
||||
"explicit": "timeout",
|
||||
}
|
||||
return mapping.get(category, "none")
|
||||
|
||||
|
||||
class AIModeration(commands.Cog):
|
||||
"""AI-powered content moderation."""
|
||||
|
||||
@@ -300,7 +311,8 @@ class AIModeration(commands.Cog):
|
||||
sensitivity=config.ai_sensitivity,
|
||||
)
|
||||
logger.info(
|
||||
f"Image result: nsfw={image_result.is_nsfw}, violent={image_result.is_violent}, conf={image_result.confidence}"
|
||||
f"Image result: nsfw={image_result.is_nsfw}, category={image_result.nsfw_category}, "
|
||||
f"severity={image_result.nsfw_severity}, violent={image_result.is_violent}, conf={image_result.confidence}"
|
||||
)
|
||||
|
||||
if (
|
||||
@@ -315,12 +327,23 @@ class AIModeration(commands.Cog):
|
||||
if image_result.is_violent:
|
||||
categories.append(ContentCategory.VIOLENCE)
|
||||
|
||||
# Use nsfw_severity if available, otherwise use None for default calculation
|
||||
severity_override = (
|
||||
image_result.nsfw_severity if image_result.nsfw_severity > 0 else None
|
||||
)
|
||||
|
||||
# Include NSFW category in explanation for better logging
|
||||
explanation = image_result.description
|
||||
if image_result.nsfw_category and image_result.nsfw_category != "none":
|
||||
explanation = f"[{image_result.nsfw_category}] {explanation}"
|
||||
|
||||
result = ModerationResult(
|
||||
is_flagged=True,
|
||||
confidence=image_result.confidence,
|
||||
categories=categories,
|
||||
explanation=image_result.description,
|
||||
suggested_action="delete",
|
||||
explanation=explanation,
|
||||
suggested_action=_get_action_for_nsfw(image_result.nsfw_category),
|
||||
severity_override=severity_override,
|
||||
)
|
||||
await self._handle_ai_result(message, result, "Image Analysis")
|
||||
return
|
||||
@@ -346,7 +369,8 @@ class AIModeration(commands.Cog):
|
||||
sensitivity=config.ai_sensitivity,
|
||||
)
|
||||
logger.info(
|
||||
f"Embed image result: nsfw={image_result.is_nsfw}, violent={image_result.is_violent}, conf={image_result.confidence}"
|
||||
f"Embed image result: nsfw={image_result.is_nsfw}, category={image_result.nsfw_category}, "
|
||||
f"severity={image_result.nsfw_severity}, violent={image_result.is_violent}, conf={image_result.confidence}"
|
||||
)
|
||||
|
||||
if (
|
||||
@@ -360,12 +384,23 @@ class AIModeration(commands.Cog):
|
||||
if image_result.is_violent:
|
||||
categories.append(ContentCategory.VIOLENCE)
|
||||
|
||||
# Use nsfw_severity if available, otherwise use None for default calculation
|
||||
severity_override = (
|
||||
image_result.nsfw_severity if image_result.nsfw_severity > 0 else None
|
||||
)
|
||||
|
||||
# Include NSFW category in explanation for better logging
|
||||
explanation = image_result.description
|
||||
if image_result.nsfw_category and image_result.nsfw_category != "none":
|
||||
explanation = f"[{image_result.nsfw_category}] {explanation}"
|
||||
|
||||
result = ModerationResult(
|
||||
is_flagged=True,
|
||||
confidence=image_result.confidence,
|
||||
categories=categories,
|
||||
explanation=image_result.description,
|
||||
suggested_action="delete",
|
||||
explanation=explanation,
|
||||
suggested_action=_get_action_for_nsfw(image_result.nsfw_category),
|
||||
severity_override=severity_override,
|
||||
)
|
||||
await self._handle_ai_result(message, result, "Image Analysis")
|
||||
return
|
||||
|
||||
@@ -38,23 +38,43 @@ Respond in this exact JSON format:
|
||||
|
||||
Be balanced - flag genuinely problematic content but allow normal conversation, jokes, and mild language. Consider context."""
|
||||
|
||||
IMAGE_ANALYSIS_PROMPT = """Analyze this image for content moderation purposes. Check for:
|
||||
- NSFW content (nudity, sexual content)
|
||||
- Violence or gore
|
||||
- Disturbing or shocking content
|
||||
- Any content inappropriate for a general audience
|
||||
IMAGE_ANALYSIS_PROMPT = """Analyze this image for content moderation. Classify NSFW content with precision.
|
||||
|
||||
NSFW CATEGORIES (in order of severity):
|
||||
- none: Safe content, no sexual elements
|
||||
- suggestive: Revealing clothing, provocative poses, cleavage, but no nudity
|
||||
- partial_nudity: Underwear visible, partial exposure, censored nudity
|
||||
- nudity: Full nudity visible (breasts, genitals, buttocks) but no sexual acts
|
||||
- explicit: Sexual acts, pornographic content, genitalia in sexual context
|
||||
|
||||
DETECTION CRITERIA:
|
||||
- Focus on human bodies; anime/artwork should be judged by same standards
|
||||
- Swimwear in beach/pool context = suggestive at most
|
||||
- Artistic nudity (classical art, sculptures) = nudity but note in description
|
||||
- Medical/educational content = note context in description
|
||||
|
||||
ALSO CHECK FOR:
|
||||
- Violence or gore (blood, injuries, weapons used violently)
|
||||
- Disturbing content (shock imagery, extreme content)
|
||||
|
||||
Respond in this exact JSON format:
|
||||
{
|
||||
"is_nsfw": true/false,
|
||||
"nsfw_category": "none|suggestive|partial_nudity|nudity|explicit",
|
||||
"nsfw_severity": 0-100,
|
||||
"is_violent": true/false,
|
||||
"is_disturbing": true/false,
|
||||
"confidence": 0.0-1.0,
|
||||
"description": "Brief description of the image",
|
||||
"description": "Brief description including context",
|
||||
"categories": ["category1", "category2"]
|
||||
}
|
||||
|
||||
Be accurate but not overly sensitive - artistic nudity or mild violence in appropriate contexts may be acceptable."""
|
||||
NSFW SEVERITY GUIDELINES:
|
||||
- none: 0
|
||||
- suggestive: 20-35
|
||||
- partial_nudity: 40-55
|
||||
- nudity: 60-75
|
||||
- explicit: 80-100"""
|
||||
|
||||
PHISHING_ANALYSIS_PROMPT = """Analyze this URL and message context for phishing or scam indicators.
|
||||
|
||||
@@ -97,6 +117,7 @@ class AnthropicProvider(AIProvider):
|
||||
|
||||
async def _call_api(self, system: str, user_content: Any, max_tokens: int = 500) -> str:
|
||||
"""Make an API call to Claude."""
|
||||
|
||||
async def _request() -> str:
|
||||
message = await self.client.messages.create(
|
||||
model=self.model,
|
||||
@@ -180,11 +201,29 @@ class AnthropicProvider(AIProvider):
|
||||
|
||||
import aiohttp
|
||||
|
||||
sensitivity_note = ""
|
||||
if sensitivity < 30:
|
||||
sensitivity_note = "\n\nBe lenient - only flag explicit content."
|
||||
sensitivity_note = """
|
||||
|
||||
SENSITIVITY: LENIENT
|
||||
- Allow suggestive content (swimwear, revealing clothing)
|
||||
- Only flag partial_nudity and above as NSFW
|
||||
- Consider artistic/educational context favorably
|
||||
- Set is_nsfw=false for suggestive content"""
|
||||
elif sensitivity > 70:
|
||||
sensitivity_note = "\n\nBe strict - flag suggestive content as well."
|
||||
sensitivity_note = """
|
||||
|
||||
SENSITIVITY: STRICT
|
||||
- Flag suggestive content as NSFW (is_nsfw=true)
|
||||
- No tolerance for any nudity regardless of context
|
||||
- Provocative poses should be flagged
|
||||
- Lower threshold for nsfw_severity scores"""
|
||||
else:
|
||||
sensitivity_note = """
|
||||
|
||||
SENSITIVITY: BALANCED
|
||||
- Allow normal swimwear/fashion (is_nsfw=false for suggestive)
|
||||
- Flag partial_nudity and above as NSFW
|
||||
- Consider context for edge cases"""
|
||||
|
||||
system = IMAGE_ANALYSIS_PROMPT + sensitivity_note
|
||||
|
||||
@@ -229,6 +268,8 @@ class AnthropicProvider(AIProvider):
|
||||
confidence=float(data.get("confidence", 0.0)),
|
||||
description=data.get("description", ""),
|
||||
categories=data.get("categories", []),
|
||||
nsfw_category=data.get("nsfw_category", "none"),
|
||||
nsfw_severity=int(data.get("nsfw_severity", 0)),
|
||||
)
|
||||
|
||||
except Exception as e:
|
||||
|
||||
@@ -23,6 +23,16 @@ class ContentCategory(str, Enum):
|
||||
MISINFORMATION = "misinformation"
|
||||
|
||||
|
||||
class NSFWCategory(str, Enum):
    """Graded NSFW labels, declared from least to most severe.

    Members:
        NONE: the "none" label.
        SUGGESTIVE: revealing clothing, provocative poses.
        PARTIAL_NUDITY: partial exposure, underwear.
        NUDITY: full nudity without sexual acts.
        EXPLICIT: sexual acts, pornographic content.
    """

    NONE = "none"
    SUGGESTIVE = "suggestive"
    PARTIAL_NUDITY = "partial_nudity"
    NUDITY = "nudity"
    EXPLICIT = "explicit"
|
||||
|
||||
|
||||
_T = TypeVar("_T")
|
||||
|
||||
|
||||
@@ -90,6 +100,7 @@ class ModerationResult:
|
||||
categories: list[ContentCategory] = field(default_factory=list)
|
||||
explanation: str = ""
|
||||
suggested_action: Literal["none", "warn", "delete", "timeout", "ban"] = "none"
|
||||
severity_override: int | None = None # Direct severity for NSFW images
|
||||
|
||||
@property
|
||||
def severity(self) -> int:
|
||||
@@ -97,6 +108,10 @@ class ModerationResult:
|
||||
if not self.is_flagged:
|
||||
return 0
|
||||
|
||||
# Use override if provided (e.g., from NSFW image analysis)
|
||||
if self.severity_override is not None:
|
||||
return min(self.severity_override, 100)
|
||||
|
||||
# Base severity from confidence
|
||||
severity = int(self.confidence * 50)
|
||||
|
||||
@@ -133,6 +148,8 @@ class ImageAnalysisResult:
|
||||
confidence: float = 0.0
|
||||
description: str = ""
|
||||
categories: list[str] = field(default_factory=list)
|
||||
nsfw_category: str = "none" # Maps to NSFWCategory values
|
||||
nsfw_severity: int = 0 # 0-100 specific NSFW severity score
|
||||
|
||||
|
||||
@dataclass
|
||||
|
||||
@@ -42,6 +42,7 @@ class OpenAIProvider(AIProvider):
|
||||
max_tokens: int = 500,
|
||||
) -> str:
|
||||
"""Make an API call to OpenAI."""
|
||||
|
||||
async def _request() -> str:
|
||||
response = await self.client.chat.completions.create(
|
||||
model=self.model,
|
||||
@@ -79,6 +80,7 @@ class OpenAIProvider(AIProvider):
|
||||
"""Analyze text content for policy violations."""
|
||||
# First, use OpenAI's built-in moderation API for quick check
|
||||
try:
|
||||
|
||||
async def _moderate() -> Any:
|
||||
return await self.client.moderations.create(input=content)
|
||||
|
||||
@@ -140,23 +142,47 @@ class OpenAIProvider(AIProvider):
|
||||
sensitivity: int = 50,
|
||||
) -> ImageAnalysisResult:
|
||||
"""Analyze an image for NSFW or inappropriate content."""
|
||||
system = """Analyze this image for content moderation. Respond in JSON format:
|
||||
system = """Analyze this image for content moderation. Classify NSFW content with precision.
|
||||
|
||||
NSFW CATEGORIES (in order of severity):
|
||||
- none: Safe content, no sexual elements
|
||||
- suggestive: Revealing clothing, provocative poses, cleavage, but no nudity
|
||||
- partial_nudity: Underwear visible, partial exposure, censored nudity
|
||||
- nudity: Full nudity visible (breasts, genitals, buttocks) but no sexual acts
|
||||
- explicit: Sexual acts, pornographic content, genitalia in sexual context
|
||||
|
||||
DETECTION CRITERIA:
|
||||
- Focus on human bodies; anime/artwork should be judged by same standards
|
||||
- Swimwear in beach/pool context = suggestive at most
|
||||
- Artistic nudity (classical art, sculptures) = nudity but note in description
|
||||
|
||||
ALSO CHECK FOR:
|
||||
- Violence or gore (blood, injuries, weapons used violently)
|
||||
- Disturbing content (shock imagery, extreme content)
|
||||
|
||||
Respond in JSON format:
|
||||
{
|
||||
"is_nsfw": true/false,
|
||||
"nsfw_category": "none|suggestive|partial_nudity|nudity|explicit",
|
||||
"nsfw_severity": 0-100,
|
||||
"is_violent": true/false,
|
||||
"is_disturbing": true/false,
|
||||
"confidence": 0.0-1.0,
|
||||
"description": "Brief description",
|
||||
"description": "Brief description including context",
|
||||
"categories": ["category1"]
|
||||
}"""
|
||||
}
|
||||
|
||||
NSFW SEVERITY GUIDELINES: none=0, suggestive=20-35, partial_nudity=40-55, nudity=60-75, explicit=80-100"""
|
||||
|
||||
sensitivity_note = ""
|
||||
if sensitivity < 30:
|
||||
sensitivity_note = " Be lenient - only flag explicit content."
|
||||
sensitivity_note = " SENSITIVITY: LENIENT - Allow suggestive content, only flag partial_nudity and above, set is_nsfw=false for suggestive."
|
||||
elif sensitivity > 70:
|
||||
sensitivity_note = " Be strict - flag suggestive content."
|
||||
sensitivity_note = " SENSITIVITY: STRICT - Flag suggestive content as NSFW, no tolerance for any nudity regardless of context."
|
||||
else:
|
||||
sensitivity_note = " SENSITIVITY: BALANCED - Allow normal swimwear/fashion, flag partial_nudity and above."
|
||||
|
||||
try:
|
||||
|
||||
async def _request() -> Any:
|
||||
return await self.client.chat.completions.create(
|
||||
model="gpt-4o-mini", # Use vision-capable model
|
||||
@@ -189,6 +215,8 @@ class OpenAIProvider(AIProvider):
|
||||
confidence=float(data.get("confidence", 0.0)),
|
||||
description=data.get("description", ""),
|
||||
categories=data.get("categories", []),
|
||||
nsfw_category=data.get("nsfw_category", "none"),
|
||||
nsfw_severity=int(data.get("nsfw_severity", 0)),
|
||||
)
|
||||
|
||||
except Exception as e:
|
||||
|
||||
Reference in New Issue
Block a user