Improve NSFW image moderation accuracy with graded categories and severity scores
Some checks failed
CI/CD Pipeline / Code Quality Checks (push) Failing after 4m50s
CI/CD Pipeline / Security Scanning (push) Successful in 16s
CI/CD Pipeline / Tests (3.11) (push) Successful in 9m44s
CI/CD Pipeline / Tests (3.12) (push) Successful in 9m37s
CI/CD Pipeline / Build Docker Image (push) Has been skipped
Some checks failed
CI/CD Pipeline / Code Quality Checks (push) Failing after 4m50s
CI/CD Pipeline / Security Scanning (push) Successful in 16s
CI/CD Pipeline / Tests (3.11) (push) Successful in 9m44s
CI/CD Pipeline / Tests (3.12) (push) Successful in 9m37s
CI/CD Pipeline / Build Docker Image (push) Has been skipped
This commit is contained in:
@@ -16,6 +16,17 @@ from guardden.utils.ratelimit import RateLimitExceeded
|
|||||||
logger = logging.getLogger(__name__)
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
|
||||||
|
def _get_action_for_nsfw(category: str) -> str:
|
||||||
|
"""Map NSFW category to suggested action."""
|
||||||
|
mapping = {
|
||||||
|
"suggestive": "warn",
|
||||||
|
"partial_nudity": "delete",
|
||||||
|
"nudity": "delete",
|
||||||
|
"explicit": "timeout",
|
||||||
|
}
|
||||||
|
return mapping.get(category, "none")
|
||||||
|
|
||||||
|
|
||||||
class AIModeration(commands.Cog):
|
class AIModeration(commands.Cog):
|
||||||
"""AI-powered content moderation."""
|
"""AI-powered content moderation."""
|
||||||
|
|
||||||
@@ -300,7 +311,8 @@ class AIModeration(commands.Cog):
|
|||||||
sensitivity=config.ai_sensitivity,
|
sensitivity=config.ai_sensitivity,
|
||||||
)
|
)
|
||||||
logger.info(
|
logger.info(
|
||||||
f"Image result: nsfw={image_result.is_nsfw}, violent={image_result.is_violent}, conf={image_result.confidence}"
|
f"Image result: nsfw={image_result.is_nsfw}, category={image_result.nsfw_category}, "
|
||||||
|
f"severity={image_result.nsfw_severity}, violent={image_result.is_violent}, conf={image_result.confidence}"
|
||||||
)
|
)
|
||||||
|
|
||||||
if (
|
if (
|
||||||
@@ -315,12 +327,23 @@ class AIModeration(commands.Cog):
|
|||||||
if image_result.is_violent:
|
if image_result.is_violent:
|
||||||
categories.append(ContentCategory.VIOLENCE)
|
categories.append(ContentCategory.VIOLENCE)
|
||||||
|
|
||||||
|
# Use nsfw_severity if available, otherwise use None for default calculation
|
||||||
|
severity_override = (
|
||||||
|
image_result.nsfw_severity if image_result.nsfw_severity > 0 else None
|
||||||
|
)
|
||||||
|
|
||||||
|
# Include NSFW category in explanation for better logging
|
||||||
|
explanation = image_result.description
|
||||||
|
if image_result.nsfw_category and image_result.nsfw_category != "none":
|
||||||
|
explanation = f"[{image_result.nsfw_category}] {explanation}"
|
||||||
|
|
||||||
result = ModerationResult(
|
result = ModerationResult(
|
||||||
is_flagged=True,
|
is_flagged=True,
|
||||||
confidence=image_result.confidence,
|
confidence=image_result.confidence,
|
||||||
categories=categories,
|
categories=categories,
|
||||||
explanation=image_result.description,
|
explanation=explanation,
|
||||||
suggested_action="delete",
|
suggested_action=_get_action_for_nsfw(image_result.nsfw_category),
|
||||||
|
severity_override=severity_override,
|
||||||
)
|
)
|
||||||
await self._handle_ai_result(message, result, "Image Analysis")
|
await self._handle_ai_result(message, result, "Image Analysis")
|
||||||
return
|
return
|
||||||
@@ -346,7 +369,8 @@ class AIModeration(commands.Cog):
|
|||||||
sensitivity=config.ai_sensitivity,
|
sensitivity=config.ai_sensitivity,
|
||||||
)
|
)
|
||||||
logger.info(
|
logger.info(
|
||||||
f"Embed image result: nsfw={image_result.is_nsfw}, violent={image_result.is_violent}, conf={image_result.confidence}"
|
f"Embed image result: nsfw={image_result.is_nsfw}, category={image_result.nsfw_category}, "
|
||||||
|
f"severity={image_result.nsfw_severity}, violent={image_result.is_violent}, conf={image_result.confidence}"
|
||||||
)
|
)
|
||||||
|
|
||||||
if (
|
if (
|
||||||
@@ -360,12 +384,23 @@ class AIModeration(commands.Cog):
|
|||||||
if image_result.is_violent:
|
if image_result.is_violent:
|
||||||
categories.append(ContentCategory.VIOLENCE)
|
categories.append(ContentCategory.VIOLENCE)
|
||||||
|
|
||||||
|
# Use nsfw_severity if available, otherwise use None for default calculation
|
||||||
|
severity_override = (
|
||||||
|
image_result.nsfw_severity if image_result.nsfw_severity > 0 else None
|
||||||
|
)
|
||||||
|
|
||||||
|
# Include NSFW category in explanation for better logging
|
||||||
|
explanation = image_result.description
|
||||||
|
if image_result.nsfw_category and image_result.nsfw_category != "none":
|
||||||
|
explanation = f"[{image_result.nsfw_category}] {explanation}"
|
||||||
|
|
||||||
result = ModerationResult(
|
result = ModerationResult(
|
||||||
is_flagged=True,
|
is_flagged=True,
|
||||||
confidence=image_result.confidence,
|
confidence=image_result.confidence,
|
||||||
categories=categories,
|
categories=categories,
|
||||||
explanation=image_result.description,
|
explanation=explanation,
|
||||||
suggested_action="delete",
|
suggested_action=_get_action_for_nsfw(image_result.nsfw_category),
|
||||||
|
severity_override=severity_override,
|
||||||
)
|
)
|
||||||
await self._handle_ai_result(message, result, "Image Analysis")
|
await self._handle_ai_result(message, result, "Image Analysis")
|
||||||
return
|
return
|
||||||
|
|||||||
@@ -38,23 +38,43 @@ Respond in this exact JSON format:
|
|||||||
|
|
||||||
Be balanced - flag genuinely problematic content but allow normal conversation, jokes, and mild language. Consider context."""
|
Be balanced - flag genuinely problematic content but allow normal conversation, jokes, and mild language. Consider context."""
|
||||||
|
|
||||||
IMAGE_ANALYSIS_PROMPT = """Analyze this image for content moderation purposes. Check for:
|
IMAGE_ANALYSIS_PROMPT = """Analyze this image for content moderation. Classify NSFW content with precision.
|
||||||
- NSFW content (nudity, sexual content)
|
|
||||||
- Violence or gore
|
NSFW CATEGORIES (in order of severity):
|
||||||
- Disturbing or shocking content
|
- none: Safe content, no sexual elements
|
||||||
- Any content inappropriate for a general audience
|
- suggestive: Revealing clothing, provocative poses, cleavage, but no nudity
|
||||||
|
- partial_nudity: Underwear visible, partial exposure, censored nudity
|
||||||
|
- nudity: Full nudity visible (breasts, genitals, buttocks) but no sexual acts
|
||||||
|
- explicit: Sexual acts, pornographic content, genitalia in sexual context
|
||||||
|
|
||||||
|
DETECTION CRITERIA:
|
||||||
|
- Focus on human bodies; anime/artwork should be judged by same standards
|
||||||
|
- Swimwear in beach/pool context = suggestive at most
|
||||||
|
- Artistic nudity (classical art, sculptures) = nudity but note in description
|
||||||
|
- Medical/educational content = note context in description
|
||||||
|
|
||||||
|
ALSO CHECK FOR:
|
||||||
|
- Violence or gore (blood, injuries, weapons used violently)
|
||||||
|
- Disturbing content (shock imagery, extreme content)
|
||||||
|
|
||||||
Respond in this exact JSON format:
|
Respond in this exact JSON format:
|
||||||
{
|
{
|
||||||
"is_nsfw": true/false,
|
"is_nsfw": true/false,
|
||||||
|
"nsfw_category": "none|suggestive|partial_nudity|nudity|explicit",
|
||||||
|
"nsfw_severity": 0-100,
|
||||||
"is_violent": true/false,
|
"is_violent": true/false,
|
||||||
"is_disturbing": true/false,
|
"is_disturbing": true/false,
|
||||||
"confidence": 0.0-1.0,
|
"confidence": 0.0-1.0,
|
||||||
"description": "Brief description of the image",
|
"description": "Brief description including context",
|
||||||
"categories": ["category1", "category2"]
|
"categories": ["category1", "category2"]
|
||||||
}
|
}
|
||||||
|
|
||||||
Be accurate but not overly sensitive - artistic nudity or mild violence in appropriate contexts may be acceptable."""
|
NSFW SEVERITY GUIDELINES:
|
||||||
|
- none: 0
|
||||||
|
- suggestive: 20-35
|
||||||
|
- partial_nudity: 40-55
|
||||||
|
- nudity: 60-75
|
||||||
|
- explicit: 80-100"""
|
||||||
|
|
||||||
PHISHING_ANALYSIS_PROMPT = """Analyze this URL and message context for phishing or scam indicators.
|
PHISHING_ANALYSIS_PROMPT = """Analyze this URL and message context for phishing or scam indicators.
|
||||||
|
|
||||||
@@ -97,6 +117,7 @@ class AnthropicProvider(AIProvider):
|
|||||||
|
|
||||||
async def _call_api(self, system: str, user_content: Any, max_tokens: int = 500) -> str:
|
async def _call_api(self, system: str, user_content: Any, max_tokens: int = 500) -> str:
|
||||||
"""Make an API call to Claude."""
|
"""Make an API call to Claude."""
|
||||||
|
|
||||||
async def _request() -> str:
|
async def _request() -> str:
|
||||||
message = await self.client.messages.create(
|
message = await self.client.messages.create(
|
||||||
model=self.model,
|
model=self.model,
|
||||||
@@ -180,11 +201,29 @@ class AnthropicProvider(AIProvider):
|
|||||||
|
|
||||||
import aiohttp
|
import aiohttp
|
||||||
|
|
||||||
sensitivity_note = ""
|
|
||||||
if sensitivity < 30:
|
if sensitivity < 30:
|
||||||
sensitivity_note = "\n\nBe lenient - only flag explicit content."
|
sensitivity_note = """
|
||||||
|
|
||||||
|
SENSITIVITY: LENIENT
|
||||||
|
- Allow suggestive content (swimwear, revealing clothing)
|
||||||
|
- Only flag partial_nudity and above as NSFW
|
||||||
|
- Consider artistic/educational context favorably
|
||||||
|
- Set is_nsfw=false for suggestive content"""
|
||||||
elif sensitivity > 70:
|
elif sensitivity > 70:
|
||||||
sensitivity_note = "\n\nBe strict - flag suggestive content as well."
|
sensitivity_note = """
|
||||||
|
|
||||||
|
SENSITIVITY: STRICT
|
||||||
|
- Flag suggestive content as NSFW (is_nsfw=true)
|
||||||
|
- No tolerance for any nudity regardless of context
|
||||||
|
- Provocative poses should be flagged
|
||||||
|
- Lower threshold for nsfw_severity scores"""
|
||||||
|
else:
|
||||||
|
sensitivity_note = """
|
||||||
|
|
||||||
|
SENSITIVITY: BALANCED
|
||||||
|
- Allow normal swimwear/fashion (is_nsfw=false for suggestive)
|
||||||
|
- Flag partial_nudity and above as NSFW
|
||||||
|
- Consider context for edge cases"""
|
||||||
|
|
||||||
system = IMAGE_ANALYSIS_PROMPT + sensitivity_note
|
system = IMAGE_ANALYSIS_PROMPT + sensitivity_note
|
||||||
|
|
||||||
@@ -229,6 +268,8 @@ class AnthropicProvider(AIProvider):
|
|||||||
confidence=float(data.get("confidence", 0.0)),
|
confidence=float(data.get("confidence", 0.0)),
|
||||||
description=data.get("description", ""),
|
description=data.get("description", ""),
|
||||||
categories=data.get("categories", []),
|
categories=data.get("categories", []),
|
||||||
|
nsfw_category=data.get("nsfw_category", "none"),
|
||||||
|
nsfw_severity=int(data.get("nsfw_severity", 0)),
|
||||||
)
|
)
|
||||||
|
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
|
|||||||
@@ -23,6 +23,16 @@ class ContentCategory(str, Enum):
|
|||||||
MISINFORMATION = "misinformation"
|
MISINFORMATION = "misinformation"
|
||||||
|
|
||||||
|
|
||||||
|
class NSFWCategory(str, Enum):
    """NSFW content subcategories, declared in order of increasing severity.

    Members are also ``str`` instances, so ``<`` and ``>`` compare them
    alphabetically, not by severity. Compare ``rank`` when severity order
    matters.
    """

    NONE = "none"
    SUGGESTIVE = "suggestive"  # Revealing clothing, provocative poses
    PARTIAL_NUDITY = "partial_nudity"  # Partial exposure, underwear
    NUDITY = "nudity"  # Full nudity without sexual acts
    EXPLICIT = "explicit"  # Sexual acts, pornographic content

    @property
    def rank(self) -> int:
        """Return this member's position in severity order (NONE is 0)."""
        # Declaration order is the severity order, so the index is the rank.
        return list(type(self)).index(self)
|
||||||
|
|
||||||
|
|
||||||
_T = TypeVar("_T")
|
_T = TypeVar("_T")
|
||||||
|
|
||||||
|
|
||||||
@@ -90,6 +100,7 @@ class ModerationResult:
|
|||||||
categories: list[ContentCategory] = field(default_factory=list)
|
categories: list[ContentCategory] = field(default_factory=list)
|
||||||
explanation: str = ""
|
explanation: str = ""
|
||||||
suggested_action: Literal["none", "warn", "delete", "timeout", "ban"] = "none"
|
suggested_action: Literal["none", "warn", "delete", "timeout", "ban"] = "none"
|
||||||
|
severity_override: int | None = None # Direct severity for NSFW images
|
||||||
|
|
||||||
@property
|
@property
|
||||||
def severity(self) -> int:
|
def severity(self) -> int:
|
||||||
@@ -97,6 +108,10 @@ class ModerationResult:
|
|||||||
if not self.is_flagged:
|
if not self.is_flagged:
|
||||||
return 0
|
return 0
|
||||||
|
|
||||||
|
# Use override if provided (e.g., from NSFW image analysis)
|
||||||
|
if self.severity_override is not None:
|
||||||
|
return min(self.severity_override, 100)
|
||||||
|
|
||||||
# Base severity from confidence
|
# Base severity from confidence
|
||||||
severity = int(self.confidence * 50)
|
severity = int(self.confidence * 50)
|
||||||
|
|
||||||
@@ -133,6 +148,8 @@ class ImageAnalysisResult:
|
|||||||
confidence: float = 0.0
|
confidence: float = 0.0
|
||||||
description: str = ""
|
description: str = ""
|
||||||
categories: list[str] = field(default_factory=list)
|
categories: list[str] = field(default_factory=list)
|
||||||
|
nsfw_category: str = "none" # Maps to NSFWCategory values
|
||||||
|
nsfw_severity: int = 0 # 0-100 specific NSFW severity score
|
||||||
|
|
||||||
|
|
||||||
@dataclass
|
@dataclass
|
||||||
|
|||||||
@@ -42,6 +42,7 @@ class OpenAIProvider(AIProvider):
|
|||||||
max_tokens: int = 500,
|
max_tokens: int = 500,
|
||||||
) -> str:
|
) -> str:
|
||||||
"""Make an API call to OpenAI."""
|
"""Make an API call to OpenAI."""
|
||||||
|
|
||||||
async def _request() -> str:
|
async def _request() -> str:
|
||||||
response = await self.client.chat.completions.create(
|
response = await self.client.chat.completions.create(
|
||||||
model=self.model,
|
model=self.model,
|
||||||
@@ -79,6 +80,7 @@ class OpenAIProvider(AIProvider):
|
|||||||
"""Analyze text content for policy violations."""
|
"""Analyze text content for policy violations."""
|
||||||
# First, use OpenAI's built-in moderation API for quick check
|
# First, use OpenAI's built-in moderation API for quick check
|
||||||
try:
|
try:
|
||||||
|
|
||||||
async def _moderate() -> Any:
|
async def _moderate() -> Any:
|
||||||
return await self.client.moderations.create(input=content)
|
return await self.client.moderations.create(input=content)
|
||||||
|
|
||||||
@@ -140,23 +142,47 @@ class OpenAIProvider(AIProvider):
|
|||||||
sensitivity: int = 50,
|
sensitivity: int = 50,
|
||||||
) -> ImageAnalysisResult:
|
) -> ImageAnalysisResult:
|
||||||
"""Analyze an image for NSFW or inappropriate content."""
|
"""Analyze an image for NSFW or inappropriate content."""
|
||||||
system = """Analyze this image for content moderation. Respond in JSON format:
|
system = """Analyze this image for content moderation. Classify NSFW content with precision.
|
||||||
|
|
||||||
|
NSFW CATEGORIES (in order of severity):
|
||||||
|
- none: Safe content, no sexual elements
|
||||||
|
- suggestive: Revealing clothing, provocative poses, cleavage, but no nudity
|
||||||
|
- partial_nudity: Underwear visible, partial exposure, censored nudity
|
||||||
|
- nudity: Full nudity visible (breasts, genitals, buttocks) but no sexual acts
|
||||||
|
- explicit: Sexual acts, pornographic content, genitalia in sexual context
|
||||||
|
|
||||||
|
DETECTION CRITERIA:
|
||||||
|
- Focus on human bodies; anime/artwork should be judged by same standards
|
||||||
|
- Swimwear in beach/pool context = suggestive at most
|
||||||
|
- Artistic nudity (classical art, sculptures) = nudity but note in description
|
||||||
|
|
||||||
|
ALSO CHECK FOR:
|
||||||
|
- Violence or gore (blood, injuries, weapons used violently)
|
||||||
|
- Disturbing content (shock imagery, extreme content)
|
||||||
|
|
||||||
|
Respond in JSON format:
|
||||||
{
|
{
|
||||||
"is_nsfw": true/false,
|
"is_nsfw": true/false,
|
||||||
|
"nsfw_category": "none|suggestive|partial_nudity|nudity|explicit",
|
||||||
|
"nsfw_severity": 0-100,
|
||||||
"is_violent": true/false,
|
"is_violent": true/false,
|
||||||
"is_disturbing": true/false,
|
"is_disturbing": true/false,
|
||||||
"confidence": 0.0-1.0,
|
"confidence": 0.0-1.0,
|
||||||
"description": "Brief description",
|
"description": "Brief description including context",
|
||||||
"categories": ["category1"]
|
"categories": ["category1"]
|
||||||
}"""
|
}
|
||||||
|
|
||||||
|
NSFW SEVERITY GUIDELINES: none=0, suggestive=20-35, partial_nudity=40-55, nudity=60-75, explicit=80-100"""
|
||||||
|
|
||||||
sensitivity_note = ""
|
|
||||||
if sensitivity < 30:
|
if sensitivity < 30:
|
||||||
sensitivity_note = " Be lenient - only flag explicit content."
|
sensitivity_note = " SENSITIVITY: LENIENT - Allow suggestive content, only flag partial_nudity and above, set is_nsfw=false for suggestive."
|
||||||
elif sensitivity > 70:
|
elif sensitivity > 70:
|
||||||
sensitivity_note = " Be strict - flag suggestive content."
|
sensitivity_note = " SENSITIVITY: STRICT - Flag suggestive content as NSFW, no tolerance for any nudity regardless of context."
|
||||||
|
else:
|
||||||
|
sensitivity_note = " SENSITIVITY: BALANCED - Allow normal swimwear/fashion, flag partial_nudity and above."
|
||||||
|
|
||||||
try:
|
try:
|
||||||
|
|
||||||
async def _request() -> Any:
|
async def _request() -> Any:
|
||||||
return await self.client.chat.completions.create(
|
return await self.client.chat.completions.create(
|
||||||
model="gpt-4o-mini", # Use vision-capable model
|
model="gpt-4o-mini", # Use vision-capable model
|
||||||
@@ -189,6 +215,8 @@ class OpenAIProvider(AIProvider):
|
|||||||
confidence=float(data.get("confidence", 0.0)),
|
confidence=float(data.get("confidence", 0.0)),
|
||||||
description=data.get("description", ""),
|
description=data.get("description", ""),
|
||||||
categories=data.get("categories", []),
|
categories=data.get("categories", []),
|
||||||
|
nsfw_category=data.get("nsfw_category", "none"),
|
||||||
|
nsfw_severity=int(data.get("nsfw_severity", 0)),
|
||||||
)
|
)
|
||||||
|
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
|
|||||||
Reference in New Issue
Block a user