diff --git a/CLAUDE.md b/CLAUDE.md index 8d484b8..e57e5f3 100644 --- a/CLAUDE.md +++ b/CLAUDE.md @@ -4,7 +4,7 @@ This file provides guidance to Claude Code (claude.ai/code) when working with co ## Project Overview -GuardDen is a Discord moderation bot built with discord.py, PostgreSQL, and optional AI integration (Claude/OpenAI). Self-hosted with Docker support. +GuardDen is a minimal, cost-conscious Discord moderation bot focused on spam detection and NSFW image filtering. Built with discord.py, PostgreSQL, and optional AI integration (Claude/OpenAI) for image analysis only. Self-hosted with Docker support. ## Commands @@ -19,7 +19,7 @@ python -m guardden pytest # Run single test -pytest tests/test_verification.py::TestVerificationService::test_verify_correct +pytest tests/test_automod.py::TestAutomodService::test_spam_detection # Lint and format ruff check src tests @@ -30,73 +30,48 @@ mypy src # Docker deployment docker compose up -d + +# Database migrations +alembic upgrade head ``` ## Architecture - `src/guardden/bot.py` - Main bot class (`GuardDen`) extending `commands.Bot`, manages lifecycle and services - `src/guardden/config.py` - Pydantic settings loaded from environment variables (prefix: `GUARDDEN_`) -- `src/guardden/models/` - SQLAlchemy 2.0 async models for PostgreSQL -- `src/guardden/services/` - Business logic (database, guild config, automod, AI, verification, rate limiting) -- `src/guardden/cogs/` - Discord command groups (events, moderation, admin, automod, ai_moderation, verification) +- `src/guardden/models/guild.py` - SQLAlchemy 2.0 async models for guilds and settings +- `src/guardden/services/` - Business logic (database, guild config, automod, AI, rate limiting) +- `src/guardden/cogs/` - Discord command groups (automod, ai_moderation, owner) +- `config.yml` - Single YAML file for bot configuration ## Key Patterns - All database operations use async SQLAlchemy with `asyncpg` -- Guild configurations are cached in `GuildConfigService._cache` 
+- Guild configurations loaded from a single `config.yml` file (not per-guild) - Discord snowflake IDs stored as `BigInteger` in PostgreSQL -- Moderation actions logged to `ModerationLog` table with automatic strike escalation -- Environment variables: `GUARDDEN_DISCORD_TOKEN`, `GUARDDEN_DATABASE_URL` +- No moderation logging or strike system +- Environment variables: `GUARDDEN_DISCORD_TOKEN`, `GUARDDEN_DATABASE_URL`, AI keys ## Automod System -- `AutomodService` in `services/automod.py` handles rule-based content filtering -- Checks run in order: banned words → scam links → spam → invite links +- `AutomodService` in `services/automod.py` handles spam detection +- Checks: message rate limit → duplicate messages → mass mentions - Spam tracking uses per-guild, per-user trackers with automatic cleanup -- Scam detection uses compiled regex patterns in `SCAM_PATTERNS` list - Results return `AutomodResult` dataclass with actions to take -- **Whitelist**: Users in `GuildSettings.whitelisted_user_ids` bypass ALL automod checks -- Users with "Manage Messages" permission also bypass automod ## AI Moderation System - `services/ai/` contains provider abstraction and implementations -- `AIProvider` base class defines interface: `moderate_text()`, `analyze_image()`, `analyze_phishing()` +- `AIProvider` base class defines interface: `analyze_image()` only - `AnthropicProvider` and `OpenAIProvider` implement the interface - `NullProvider` used when AI is disabled (returns empty results) - Factory pattern via `create_ai_provider(provider, api_key)` -- `ModerationResult` includes severity scoring based on confidence + category weights -- Sensitivity setting (0-100) adjusts thresholds per guild -- **NSFW-Only Filtering** (default: `True`): When enabled, only sexual content is filtered; violence, harassment, etc. 
are allowed -- Filtering controlled by `nsfw_only_filtering` field in `GuildSettings` -- **Whitelist**: Users in `GuildSettings.whitelisted_user_ids` bypass ALL AI moderation checks - -## Verification System - -- `VerificationService` in `services/verification.py` manages challenges -- Challenge types: button, captcha, math, emoji (via `ChallengeGenerator` classes) -- `PendingVerification` tracks user challenges with expiry and attempt limits -- Discord UI components in `cogs/verification.py`: `VerifyButton`, `EmojiButton`, `CaptchaModal` -- Background task cleans up expired verifications every 5 minutes - -## Rate Limiting System - -- `RateLimiter` in `services/ratelimit.py` provides general-purpose rate limiting -- Scopes: USER (global), MEMBER (per-guild), CHANNEL, GUILD -- `@ratelimit()` decorator for easy command rate limiting -- `get_rate_limiter()` returns singleton instance -- Default limits configured for commands, moderation, verification, messages - -## Notification System - -- `utils/notifications.py` contains `send_moderation_notification()` utility -- Handles sending moderation warnings to users with DM → in-channel fallback -- **In-Channel Warnings** (default: `False`): Optional PUBLIC channel messages when DMs fail -- **IMPORTANT**: In-channel messages are PUBLIC, visible to all users (Discord API limitation) -- Temporary messages auto-delete after 10 seconds to minimize clutter -- Used by automod, AI moderation, and manual moderation commands -- Controlled by `send_in_channel_warnings` field in `GuildSettings` -- Disabled by default for privacy reasons +- `ImageAnalysisResult` includes NSFW categories, severity, confidence +- Sensitivity setting (0-100) adjusts thresholds +- **NSFW-Only Filtering** (default: `True`): Only sexual content is filtered +- **Cost Controls**: Rate limiting, deduplication, file size limits, max images per message +- `AIRateLimiter` in `services/ai_rate_limiter.py` tracks usage ## Adding New Cogs @@ -110,10 +85,3 @@ docker 
compose up -d 2. Implement `AIProvider` abstract class 3. Add to factory in `services/ai/factory.py` 4. Add config option in `config.py` - -## Adding New Challenge Type - -1. Create new `ChallengeGenerator` subclass in `services/verification.py` -2. Add to `ChallengeType` enum -3. Register in `VerificationService._generators` -4. Create corresponding UI components in `cogs/verification.py` if needed diff --git a/README.md b/README.md index 76e5b94..9e8787c 100644 --- a/README.md +++ b/README.md @@ -1,41 +1,19 @@ # GuardDen -GuardDen is a comprehensive Discord moderation bot designed to protect your community while maintaining a warm, welcoming environment. Built with privacy and self-hosting in mind, GuardDen combines AI-powered content filtering with traditional moderation tools to create a safe space for your members. +A lightweight, cost-conscious Discord moderation bot focused on essential protection. Built for self-hosting with minimal resource usage and AI costs. ## Features -### Core Moderation -- **Warn, Kick, Ban, Timeout** - Standard moderation commands with logging -- **Strike System** - Configurable point-based system with automatic escalation -- **Moderation History** - Track all actions taken against users -- **Bulk Message Deletion** - Purge up to 100 messages at once - -### Automod -- **Banned Words Filter** - Block words/phrases with regex support -- **Scam Detection** - Automatic detection of phishing/scam links +### Spam Detection - **Anti-Spam** - Rate limiting, duplicate detection, mass mention protection -- **Link Filtering** - Block Discord invites and suspicious URLs +- **Automatic Actions** - Message deletion and user timeout for spam violations -### AI Moderation -- **Text Analysis** - AI-powered content moderation using Claude or GPT -- **NSFW Image Detection** - Automatic flagging of inappropriate images -- **NSFW-Only Filtering** - Enabled by default - only filters sexual content, allows violence/harassment -- **Phishing Analysis** - 
AI-enhanced detection of scam URLs -- **Configurable Sensitivity** - Adjust strictness per server (0-100) -- **Public In-Channel Warnings** - Optional: sends temporary public channel messages when users have DMs disabled - -### Verification System -- **Multiple Challenge Types** - Button, captcha, math problems, emoji selection -- **Automatic New Member Verification** - Challenge users on join -- **Configurable Verified Role** - Auto-assign role on successful verification -- **Rate Limited** - Prevents verification spam - -### Logging -- Member joins/leaves -- Message edits and deletions -- Voice channel activity -- Ban/unban events -- All moderation actions +### AI-Powered NSFW Image Detection +- **Smart Image Analysis** - AI-powered detection of inappropriate images using Claude or GPT +- **Cost Controls** - Conservative rate limits (25 checks/hour/guild by default) +- **Embed Support** - Optional checking of Discord GIF embeds +- **NSFW Video Domain Blocking** - Block known NSFW video domains +- **Configurable Sensitivity** - Adjust strictness (0-100) ## Quick Start @@ -55,32 +33,22 @@ GuardDen is a comprehensive Discord moderation bot designed to protect your comm - Disable **Public Bot** if you only want yourself to add it - Copy the **Token** (click "Reset Token") - this is your `GUARDDEN_DISCORD_TOKEN` -5. **Enable Privileged Gateway Intents** (all three required): - - **Presence Intent** - for user status tracking - - **Server Members Intent** - for member join/leave events, verification - - **Message Content Intent** - for reading messages (automod, AI moderation) +5. **Enable Privileged Gateway Intents** (required): + - **Message Content Intent** - for reading messages (spam detection, image checking) 6. 
**Generate Invite URL** - Go to **OAuth2** > **URL Generator**: **Scopes:** - `bot` - - `applications.commands` **Bot Permissions:** - - Manage Roles - - Kick Members - - Ban Members - Moderate Members (timeout) - - Manage Channels - View Channels - Send Messages - Manage Messages - - Embed Links - - Attach Files - Read Message History - - Add Reactions - Or use permission integer: `1239943348294` + Or use permission integer: `275415089216` 7. Use the generated URL to invite the bot to your server @@ -134,480 +102,152 @@ GuardDen is a comprehensive Discord moderation bot designed to protect your comm ## Configuration -GuardDen now supports **file-based configuration** as the primary method for managing bot settings. This replaces Discord commands for configuration, providing better version control, easier management, and more reliable deployments. +GuardDen uses a **single YAML configuration file** (`config.yml`) for managing all bot settings across all guilds. -### File-Based Configuration (Recommended) +### Configuration File (`config.yml`) -#### Directory Structure -``` -config/ -├── guilds/ -│ ├── guild-123456789.yml # Per-server configuration -│ ├── guild-987654321.yml -│ └── default-template.yml # Template for new servers -├── wordlists/ -│ ├── banned-words.yml # Custom banned words -│ ├── domain-allowlists.yml # Allowed domains whitelist -│ └── external-sources.yml # Managed wordlist sources -├── schemas/ -│ ├── guild-schema.yml # Configuration validation -│ └── wordlists-schema.yml -└── templates/ - └── guild-default.yml # Default configuration template -``` +Create a `config.yml` file in your project root: -#### Quick Start with File Configuration - -1. **Create your first server configuration:** - ```bash - python -m guardden.cli.config guild create 123456789012345678 "My Discord Server" - ``` - -2. **Edit the configuration file:** - ```bash - nano config/guilds/guild-123456789012345678.yml - ``` - -3. 
**Customize settings (example):** - ```yaml - # Basic server information - guild_id: 123456789012345678 - name: "My Discord Server" - - settings: - # AI Moderation - ai_moderation: - enabled: true - sensitivity: 80 # 0-100 (higher = stricter) - nsfw_only_filtering: true # Only block sexual content, allow violence - - # Automod settings - automod: - message_rate_limit: 5 # Max messages per 5 seconds - scam_allowlist: - - "discord.com" - - "github.com" - ``` - -4. **Validate your configuration:** - ```bash - python -m guardden.cli.config guild validate 123456789012345678 - ``` - -5. **Start the bot** (configurations auto-reload): - ```bash - python -m guardden - ``` - -#### Configuration Management CLI - -**Guild Management:** -```bash -# List all configured servers -python -m guardden.cli.config guild list - -# Create new server configuration -python -m guardden.cli.config guild create "Server Name" - -# Edit specific settings -python -m guardden.cli.config guild edit ai_moderation.sensitivity 75 -python -m guardden.cli.config guild edit ai_moderation.nsfw_only_filtering true - -# Validate configurations -python -m guardden.cli.config guild validate -python -m guardden.cli.config guild validate - -# Backup configuration -python -m guardden.cli.config guild backup -``` - -**Migration from Discord Commands:** -```bash -# Export existing Discord command settings to files -python -m guardden.cli.config migrate from-database - -# Verify migration was successful -python -m guardden.cli.config migrate verify -``` - -**Wordlist Management:** -```bash -# View wordlist status -python -m guardden.cli.config wordlist info - -# View available templates -python -m guardden.cli.config template info -``` - -#### Key Configuration Options - -**AI Moderation Settings:** ```yaml -ai_moderation: - enabled: true # Enable AI content analysis - sensitivity: 80 # 0-100 scale (higher = stricter) - confidence_threshold: 0.7 # 0.0-1.0 confidence required - nsfw_only_filtering: true # true = 
only sexual content (DEFAULT), false = all content - log_only: false # true = log only, false = take action +bot: + prefix: "!" + owner_ids: + - 123456789012345678 # Your Discord user ID -notifications: - send_in_channel_warnings: false # Send temporary PUBLIC channel messages when DMs fail (DEFAULT: false) -``` - -**NSFW-Only Filtering Guide (Default: Enabled):** -- `true` = Only block sexual/nude content, allow violence and other content types **(DEFAULT)** -- `false` = Block ALL inappropriate content (sexual, violence, harassment, hate speech) - -**Public In-Channel Warnings (Default: Disabled):** -- **IMPORTANT**: These messages are PUBLIC and visible to everyone in the channel, NOT private -- When enabled and a user has DMs disabled, sends a temporary public message in the channel -- Messages auto-delete after 10 seconds to minimize clutter -- **Privacy Warning**: The user's violation and reason will be visible to all users for 10 seconds -- Set to `true` only if you prefer public transparency over privacy - -**Automod Configuration:** -```yaml +# Spam detection settings automod: - message_rate_limit: 5 # Max messages per time window - message_rate_window: 5 # Time window in seconds - duplicate_threshold: 3 # Duplicate messages to trigger - scam_allowlist: # Domains that bypass scam detection - - "discord.com" - - "github.com" + enabled: true + anti_spam_enabled: true + message_rate_limit: 5 # Max messages per window + message_rate_window: 5 # Window in seconds + duplicate_threshold: 3 # Duplicates to trigger + mention_limit: 5 # Max mentions per message + mention_rate_limit: 10 # Max mentions per window + mention_rate_window: 60 # Window in seconds + +# AI moderation settings +ai_moderation: + enabled: true + sensitivity: 80 # 0-100 (higher = stricter) + nsfw_only_filtering: true # Only filter sexual content + max_checks_per_hour_per_guild: 25 # Cost control + max_checks_per_user_per_hour: 5 # Cost control + max_images_per_message: 2 # Analyze max 2 
images/msg + max_image_size_mb: 3 # Skip images > 3MB + check_embed_images: true # Check Discord GIF embeds + check_video_thumbnails: false # Skip video thumbnails + url_image_check_enabled: false # Skip URL image downloads + +# Known NSFW video domains (auto-block) +nsfw_video_domains: + - pornhub.com + - xvideos.com + - xnxx.com + - redtube.com + - youporn.com ``` -**Banned Words Management:** -Edit `config/wordlists/banned-words.yml`: -```yaml -global_patterns: - - pattern: "badword" - action: delete - is_regex: false - category: profanity - -guild_patterns: - 123456789: # Specific server overrides - - pattern: "server-specific-rule" - action: warn - override_global: false -``` +### Key Configuration Options -#### Hot-Reloading +**AI Moderation (NSFW Image Detection):** +- `sensitivity`: 0-100 scale (higher = stricter detection) +- `nsfw_only_filtering`: Only flag sexual content (violence/harassment allowed) +- `max_checks_per_hour_per_guild`: Cost control - limits AI API calls +- `check_embed_images`: Whether to analyze Discord GIF embeds -Configuration changes are automatically detected and applied without restarting the bot: -- ✅ Edit YAML files directly -- ✅ Changes apply within seconds -- ✅ Invalid configs are rejected with error logs -- ✅ Automatic rollback on errors +**Spam Detection:** +- `message_rate_limit`: Max messages allowed per window +- `duplicate_threshold`: How many duplicate messages trigger action +- `mention_limit`: Max @mentions allowed per message + +**Cost Controls:** +The bot includes multiple layers of cost control: +- Rate limiting (25 AI checks/hour/guild, 5/hour/user by default) +- Image deduplication (tracks last 1000 analyzed messages) +- File size limits (skip images > 3MB) +- Max images per message (analyze max 2 images) +- Optional embed checking (disable to save costs) ### Environment Variables | Variable | Description | Default | |----------|-------------|---------| -| `GUARDDEN_DISCORD_TOKEN` | Your Discord bot token | 
Required | -| `GUARDDEN_DISCORD_PREFIX` | Default command prefix | `!` | -| `GUARDDEN_ALLOWED_GUILDS` | Comma-separated guild allowlist | (empty = all) | -| `GUARDDEN_OWNER_IDS` | Comma-separated owner user IDs | (empty = admins) | +| `GUARDDEN_DISCORD_TOKEN` | Your Discord bot token | **Required** | | `GUARDDEN_DATABASE_URL` | PostgreSQL connection URL | `postgresql://guardden:guardden@localhost:5432/guardden` | | `GUARDDEN_LOG_LEVEL` | Logging level | `INFO` | | `GUARDDEN_AI_PROVIDER` | AI provider (anthropic/openai/none) | `none` | | `GUARDDEN_ANTHROPIC_API_KEY` | Anthropic API key (if using Claude) | - | | `GUARDDEN_OPENAI_API_KEY` | OpenAI API key (if using GPT) | - | -| `GUARDDEN_WORDLIST_ENABLED` | Enable managed wordlist sync | `true` | -| `GUARDDEN_WORDLIST_UPDATE_HOURS` | Managed wordlist sync interval | `168` | -| `GUARDDEN_WORDLIST_SOURCES` | JSON array of wordlist sources | (empty = defaults) | -### Per-Guild Settings +## Owner Commands -Each server can be configured via YAML files in `config/guilds/`: - -**General Settings:** -- Command prefix and locale -- Channel IDs (log, moderation, welcome) -- Role IDs (mute, verified, moderator) - -**Content Moderation:** -- AI moderation (enabled, sensitivity, NSFW-only mode) -- Automod thresholds and rate limits -- Banned words and domain allowlists -- Strike system and escalation actions - -**Member Verification:** -- Verification challenges (button, captcha, math, emoji) -- Auto-role assignment - -**All settings support hot-reloading** - edit files and changes apply immediately! - -## Commands - -> **Note:** Configuration commands (`!config`, `!ai`, `!automod`, etc.) have been replaced with file-based configuration. See the [Configuration](#configuration) section above for managing settings via YAML files and the CLI tool. 
- -### Moderation - -| Command | Permission | Description | -|---------|------------|-------------| -| `!warn [reason]` | Kick Members | Warn a user | -| `!strike [points] [reason]` | Kick Members | Add strikes to a user | -| `!strikes ` | Kick Members | View user's strikes | -| `!timeout [reason]` | Moderate Members | Timeout a user (e.g., 1h, 30m, 7d) | -| `!untimeout ` | Moderate Members | Remove timeout | -| `!kick [reason]` | Kick Members | Kick a user | -| `!ban [reason]` | Ban Members | Ban a user | -| `!unban [reason]` | Ban Members | Unban a user by ID | -| `!purge ` | Manage Messages | Delete multiple messages (max 100) | -| `!modlogs ` | Kick Members | View moderation history | - -### Configuration Management - -Configuration is now managed via **YAML files** instead of Discord commands. Use the CLI tool: - -```bash -# Configuration Management CLI -python -m guardden.cli.config guild create "Server Name" -python -m guardden.cli.config guild list -python -m guardden.cli.config guild edit -python -m guardden.cli.config guild validate [guild_id] - -# Migration from old Discord commands -python -m guardden.cli.config migrate from-database -python -m guardden.cli.config migrate verify - -# Wordlist management -python -m guardden.cli.config wordlist info -``` - -**Read-only Status Commands (Still Available):** +GuardDen includes a minimal set of owner-only commands for bot management: | Command | Description | |---------|-------------| -| `!config` | View current configuration (read-only) | -| `!ai` | View AI moderation settings (read-only) | -| `!automod` | View automod status (read-only) | -| `!bannedwords` | List banned words (read-only) | +| `!status` | Show bot status (uptime, guilds, latency, AI provider) | +| `!reload` | Reload all cogs | +| `!ping` | Check bot latency | -**Configuration Examples:** - -```bash -# Set AI sensitivity to 75 (0-100 scale) -python -m guardden.cli.config guild edit 123456789 ai_moderation.sensitivity 75 - -# Enable NSFW-only 
filtering (only block sexual content) -python -m guardden.cli.config guild edit 123456789 ai_moderation.nsfw_only_filtering true - -# Add domain to scam allowlist -Edit config/wordlists/domain-allowlists.yml - -# Add banned word pattern -Edit config/wordlists/banned-words.yml -``` - -### Whitelist Management (Admin only) - -| Command | Description | -|---------|-------------| -| `!whitelist` | View all whitelisted users | -| `!whitelist add @user` | Add a user to the whitelist (bypasses all moderation) | -| `!whitelist remove @user` | Remove a user from the whitelist | -| `!whitelist clear` | Clear the entire whitelist | - -**What is the whitelist?** -- Whitelisted users bypass **ALL** moderation checks (automod and AI moderation) -- Useful for trusted members, bots, or staff who need to post content that might trigger filters -- Users with "Manage Messages" permission are already exempt from moderation - -### Diagnostics (Admin only) - -| Command | Description | -|---------|-------------| -| `!health` | Check database and AI provider status | - -### Verification (Admin only) - -| Command | Description | -|---------|-------------| -| `!verify` | Request verification (for users) | -| `!verify setup` | View verification setup status | -| `!verify enable` | Enable verification for new members | -| `!verify disable` | Disable verification | -| `!verify role @role` | Set the verified role | -| `!verify type ` | Set verification type (button/captcha/math/emoji) | -| `!verify test [type]` | Test a verification challenge | -| `!verify reset @user` | Reset verification for a user | - -## CI (Gitea Actions) - -Workflows live under `.gitea/workflows/` and mirror the previous GitHub Actions -pipeline for linting, tests, and Docker builds. +**Note:** All configuration is done via the `config.yml` file. There are no in-Discord configuration commands. 
## Project Structure ``` guardden/ ├── src/guardden/ -│ ├── bot.py # Main bot class -│ ├── config.py # Settings management -│ ├── cogs/ # Discord command groups -│ │ ├── admin.py # Configuration commands (read-only) -│ │ ├── ai_moderation.py # AI-powered moderation -│ │ ├── automod.py # Automatic moderation -│ │ ├── events.py # Event logging -│ │ ├── moderation.py # Moderation commands -│ │ └── verification.py # Member verification -│ ├── models/ # Database models -│ │ ├── guild.py # Guild settings, banned words -│ │ └── moderation.py # Logs, strikes, notes -│ ├── services/ # Business logic -│ │ ├── ai/ # AI provider implementations -│ │ ├── automod.py # Content filtering -│ │ ├── database.py # DB connections -│ │ ├── guild_config.py # Config caching -│ │ ├── file_config.py # File-based configuration system -│ │ ├── config_migration.py # Migration from DB to files -│ │ ├── ratelimit.py # Rate limiting -│ │ └── verification.py # Verification challenges -│ └── cli/ # Command-line tools -│ └── config.py # Configuration management CLI -├── config/ # File-based configuration -│ ├── guilds/ # Per-server configuration files -│ ├── wordlists/ # Banned words and allowlists -│ ├── schemas/ # Configuration validation schemas -│ └── templates/ # Configuration templates -├── tests/ # Test suite -├── migrations/ # Database migrations -├── docker-compose.yml # Docker deployment -├── pyproject.toml # Dependencies -├── README.md # This file -└── MIGRATION.md # Migration guide for file-based config +│ ├── bot.py # Main bot class +│ ├── config.py # Settings management +│ ├── cogs/ # Discord command groups +│ │ ├── automod.py # Spam detection +│ │ ├── ai_moderation.py # NSFW image detection +│ │ └── owner.py # Owner commands +│ ├── models/ # Database models +│ │ └── guild.py # Guild settings +│ ├── services/ # Business logic +│ │ ├── ai/ # AI provider implementations +│ │ ├── automod.py # Spam detection logic +│ │ ├── config_loader.py # YAML config loading +│ │ ├── ai_rate_limiter.py 
# AI cost control +│ │ ├── database.py # DB connections +│ │ └── guild_config.py # Config caching +│ └── __main__.py # Entry point +├── config.yml # Bot configuration +├── tests/ # Test suite +├── migrations/ # Database migrations +├── docker-compose.yml # Docker deployment +├── pyproject.toml # Dependencies +└── README.md # This file ``` -## Verification System +## How It Works -GuardDen includes a verification system to protect your server from bots and raids. +### Spam Detection +1. Bot monitors message rate per user +2. Detects duplicate messages +3. Counts @mentions (mass mention detection) +4. Violations result in message deletion + timeout -### Challenge Types +### NSFW Image Detection +1. Bot checks attachments and embeds for images +2. Applies rate limiting and deduplication +3. Downloads image and sends to AI provider +4. AI analyzes for NSFW content categories +5. Violations result in message deletion + timeout +6. Optionally checks known NSFW video domain links -| Type | Description | -|------|-------------| -| `button` | Simple button click (default, easiest) | -| `captcha` | Text-based captcha code entry | -| `math` | Solve a simple math problem | -| `emoji` | Select the correct emoji from options | +### Cost Management +The bot includes aggressive cost controls for AI usage: +- **Rate Limiting**: 25 checks/hour/guild, 5/hour/user (configurable) +- **Deduplication**: Skips recently analyzed message IDs +- **File Size Limits**: Skips images larger than 3MB +- **Max Images**: Analyzes max 2 images per message +- **Optional Features**: Embed checking, video thumbnails, URL downloads all controllable -### Setup - -1. Create a verified role in your server -2. Configure the role permissions (verified members get full access) -3. Set up verification: - ``` - !verify role @Verified - !verify type captcha - !verify enable - ``` - -### How It Works - -1. New member joins the server -2. Bot sends verification challenge via DM (or channel if DMs disabled) -3. 
Member completes the challenge -4. Bot assigns the verified role -5. Member gains access to the server - -## AI Moderation - -GuardDen supports AI-powered content moderation using either Anthropic's Claude or OpenAI's GPT models. - -### Setup - -1. Set the AI provider in your environment: - ```bash - GUARDDEN_AI_PROVIDER=anthropic # or "openai" - GUARDDEN_ANTHROPIC_API_KEY=sk-ant-... # if using Claude - GUARDDEN_OPENAI_API_KEY=sk-... # if using OpenAI - ``` - -2. Enable AI moderation per server: - ``` - !ai enable - !ai sensitivity 50 # 0=lenient, 100=strict - !ai nsfw true # Enable NSFW image detection - ``` - -### Content Categories - -The AI analyzes content for: -- **Harassment** - Personal attacks, bullying -- **Hate Speech** - Discrimination, slurs -- **Sexual Content** - Explicit material -- **Violence** - Threats, graphic content -- **Self-Harm** - Suicide/self-injury content -- **Scams** - Phishing, fraud attempts -- **Spam** - Promotional, low-quality content - -### How It Works - -1. Messages are analyzed by the AI provider -2. Results include confidence scores and severity ratings -3. Actions are taken based on guild sensitivity settings -4. All AI actions are logged to the mod log channel - -### NSFW-Only Filtering Mode (Enabled by Default) - -**Default Behavior:** -GuardDen is configured to only filter sexual/NSFW content by default. This allows communities to have mature discussions about violence, politics, and controversial topics while still maintaining a "safe for work" environment. 
- -**When enabled (DEFAULT):** -- ✅ **Blocked:** Sexual content, nude images, explicit material -- ❌ **Allowed:** Violence, harassment, hate speech, self-harm content - -**When disabled (strict mode):** -- ✅ **Blocked:** All inappropriate content categories - -**To change to strict mode:** -```yaml -# Edit config/guilds/guild-.yml -ai_moderation: - nsfw_only_filtering: false -``` - -This default is useful for: -- Gaming communities (violence in gaming discussions) -- Mature discussion servers (politics, news) -- Communities with specific content policies that allow violence but prohibit sexual material - -### Public In-Channel Warnings (Disabled by Default) - -**IMPORTANT PRIVACY NOTICE**: In-channel warnings are **PUBLIC** and visible to all users in the channel, NOT private messages. This is a Discord API limitation. - -When enabled and users have DMs disabled, moderation warnings are sent as temporary public messages in the channel where the violation occurred. - -**How it works:** -1. Bot tries to DM the user about the violation -2. If DM fails (user has DMs disabled): - - If `send_in_channel_warnings: true`: Sends a **PUBLIC** temporary message in the channel mentioning the user - - If `send_in_channel_warnings: false` (DEFAULT): Silent failure, no notification sent - - Message includes violation reason and any timeout information - - Message auto-deletes after 10 seconds -3. 
If DM succeeds, no channel message is sent - -**To enable in-channel warnings:** -```yaml -# Edit config/guilds/guild-.yml -notifications: - send_in_channel_warnings: true -``` - -**Considerations:** - -**Pros:** -- Users are always notified of moderation actions, even with DMs disabled -- Public transparency about what content is not allowed -- Educational for other members - -**Cons:** -- **NOT PRIVATE** - Violation details visible to all users for 10 seconds -- May embarrass users publicly -- Could expose sensitive moderation information -- Privacy-conscious communities may prefer silent failures +**Estimated Costs** (with defaults): +- Small server (< 100 users): ~$5-10/month +- Medium server (100-500 users): ~$15-25/month +- Large server (500+ users): Consider increasing rate limits or disabling embeds ## Development @@ -638,15 +278,9 @@ MIT License - see LICENSE file for details. - **Documentation**: See `docs/` directory - **Configuration Help**: Check `CLAUDE.md` for developer guidance -## Roadmap +## Future Considerations -- [x] AI-powered content moderation (Claude/OpenAI integration) -- [x] NSFW image detection -- [x] NSFW-only filtering mode (default) -- [x] Optional public in-channel warnings when DMs disabled -- [x] Verification/captcha system -- [x] Rate limiting -- [ ] Voice channel moderation -- [ ] Slash commands with true ephemeral messages -- [ ] Custom notification templates -- [ ] Advanced analytics dashboard +- [ ] Per-guild sensitivity settings (currently global) +- [ ] Slash commands +- [ ] Custom NSFW category thresholds +- [ ] Whitelist for trusted image sources diff --git a/migrations/versions/20260127_minimal_bot_cleanup.py b/migrations/versions/20260127_minimal_bot_cleanup.py new file mode 100644 index 0000000..c7ead4a --- /dev/null +++ b/migrations/versions/20260127_minimal_bot_cleanup.py @@ -0,0 +1,214 @@ +"""Minimal bot cleanup - remove unused tables and columns. 
+ +Revision ID: 20260127_minimal_bot_cleanup +Revises: 20260125_add_whitelist +Create Date: 2026-01-27 00:00:00.000000 +""" + +import sqlalchemy as sa +from alembic import op +from sqlalchemy.dialects import postgresql + +# revision identifiers, used by Alembic. +revision = "20260127_minimal_bot_cleanup" +down_revision = "20260125_add_whitelist" +branch_labels = None +depends_on = None + + +def upgrade() -> None: + """Remove tables and columns not needed for minimal bot.""" + # Drop unused tables + op.drop_table("user_activity") + op.drop_table("message_activity") + op.drop_table("ai_checks") + op.drop_table("banned_words") + op.drop_table("user_notes") + op.drop_table("strikes") + op.drop_table("moderation_logs") + + # Drop unused columns from guild_settings + op.drop_column("guild_settings", "verification_enabled") + op.drop_column("guild_settings", "verification_type") + op.drop_column("guild_settings", "verified_role_id") + op.drop_column("guild_settings", "strike_actions") + op.drop_column("guild_settings", "mute_role_id") + op.drop_column("guild_settings", "mod_role_ids") + op.drop_column("guild_settings", "welcome_channel_id") + op.drop_column("guild_settings", "whitelisted_user_ids") + op.drop_column("guild_settings", "scam_allowlist") + op.drop_column("guild_settings", "send_in_channel_warnings") + op.drop_column("guild_settings", "ai_log_only") + op.drop_column("guild_settings", "ai_confidence_threshold") + op.drop_column("guild_settings", "log_channel_id") + op.drop_column("guild_settings", "mod_log_channel_id") + op.drop_column("guild_settings", "link_filter_enabled") + + +def downgrade() -> None: + """Restore removed tables and columns (WARNING: Data will be lost!).""" + # Restore guild_settings columns + op.add_column( + "guild_settings", + sa.Column("link_filter_enabled", sa.Boolean, nullable=False, default=False), + ) + op.add_column( + "guild_settings", + sa.Column("mod_log_channel_id", sa.BigInteger, nullable=True), + ) + op.add_column( + 
"guild_settings", + sa.Column("log_channel_id", sa.BigInteger, nullable=True), + ) + op.add_column( + "guild_settings", + sa.Column("ai_confidence_threshold", sa.Float, nullable=False, default=0.7), + ) + op.add_column( + "guild_settings", + sa.Column("ai_log_only", sa.Boolean, nullable=False, default=False), + ) + op.add_column( + "guild_settings", + sa.Column("send_in_channel_warnings", sa.Boolean, nullable=False, default=False), + ) + op.add_column( + "guild_settings", + sa.Column( + "scam_allowlist", + postgresql.JSONB().with_variant(sa.JSON(), "sqlite"), + nullable=False, + default=list, + ), + ) + op.add_column( + "guild_settings", + sa.Column( + "whitelisted_user_ids", + postgresql.JSONB().with_variant(sa.JSON(), "sqlite"), + nullable=False, + default=list, + ), + ) + op.add_column( + "guild_settings", + sa.Column("welcome_channel_id", sa.BigInteger, nullable=True), + ) + op.add_column( + "guild_settings", + sa.Column( + "mod_role_ids", + postgresql.JSONB().with_variant(sa.JSON(), "sqlite"), + nullable=False, + default=list, + ), + ) + op.add_column( + "guild_settings", + sa.Column("mute_role_id", sa.BigInteger, nullable=True), + ) + op.add_column( + "guild_settings", + sa.Column( + "strike_actions", + postgresql.JSONB().with_variant(sa.JSON(), "sqlite"), + nullable=False, + ), + ) + op.add_column( + "guild_settings", + sa.Column("verified_role_id", sa.BigInteger, nullable=True), + ) + op.add_column( + "guild_settings", + sa.Column("verification_type", sa.String(20), nullable=False, default="button"), + ) + op.add_column( + "guild_settings", + sa.Column("verification_enabled", sa.Boolean, nullable=False, default=False), + ) + + # Restore tables (empty, data lost) + op.create_table( + "moderation_logs", + sa.Column("id", sa.Integer, primary_key=True, autoincrement=True), + sa.Column("guild_id", sa.BigInteger, nullable=False), + sa.Column("user_id", sa.BigInteger, nullable=False), + sa.Column("moderator_id", sa.BigInteger, nullable=False), + 
sa.Column("action", sa.String(20), nullable=False), + sa.Column("reason", sa.Text, nullable=True), + sa.Column("created_at", sa.DateTime, nullable=False), + sa.ForeignKeyConstraint(["guild_id"], ["guilds.id"], ondelete="CASCADE"), + ) + + op.create_table( + "strikes", + sa.Column("id", sa.Integer, primary_key=True, autoincrement=True), + sa.Column("guild_id", sa.BigInteger, nullable=False), + sa.Column("user_id", sa.BigInteger, nullable=False), + sa.Column("reason", sa.Text, nullable=True), + sa.Column("created_at", sa.DateTime, nullable=False), + sa.ForeignKeyConstraint(["guild_id"], ["guilds.id"], ondelete="CASCADE"), + ) + + op.create_table( + "user_notes", + sa.Column("id", sa.Integer, primary_key=True, autoincrement=True), + sa.Column("guild_id", sa.BigInteger, nullable=False), + sa.Column("user_id", sa.BigInteger, nullable=False), + sa.Column("moderator_id", sa.BigInteger, nullable=False), + sa.Column("note", sa.Text, nullable=False), + sa.Column("created_at", sa.DateTime, nullable=False), + sa.ForeignKeyConstraint(["guild_id"], ["guilds.id"], ondelete="CASCADE"), + ) + + op.create_table( + "banned_words", + sa.Column("id", sa.Integer, primary_key=True, autoincrement=True), + sa.Column("guild_id", sa.BigInteger, nullable=False), + sa.Column("pattern", sa.Text, nullable=False), + sa.Column("is_regex", sa.Boolean, nullable=False, default=False), + sa.Column("action", sa.String(20), nullable=False, default="delete"), + sa.Column("reason", sa.Text, nullable=True), + sa.Column("source", sa.String(100), nullable=True), + sa.Column("category", sa.String(20), nullable=True), + sa.Column("managed", sa.Boolean, nullable=False, default=False), + sa.Column("added_by", sa.BigInteger, nullable=False), + sa.Column("created_at", sa.DateTime, nullable=False), + sa.Column("updated_at", sa.DateTime, nullable=False), + sa.ForeignKeyConstraint(["guild_id"], ["guilds.id"], ondelete="CASCADE"), + ) + + op.create_table( + "ai_checks", + sa.Column("id", sa.Integer, primary_key=True, 
autoincrement=True), + sa.Column("guild_id", sa.BigInteger, nullable=False), + sa.Column("user_id", sa.BigInteger, nullable=False), + sa.Column("message_id", sa.BigInteger, nullable=False), + sa.Column("check_type", sa.String(20), nullable=False), + sa.Column("flagged", sa.Boolean, nullable=False), + sa.Column("confidence", sa.Float, nullable=False), + sa.Column("created_at", sa.DateTime, nullable=False), + sa.ForeignKeyConstraint(["guild_id"], ["guilds.id"], ondelete="CASCADE"), + ) + + op.create_table( + "message_activity", + sa.Column("id", sa.Integer, primary_key=True, autoincrement=True), + sa.Column("guild_id", sa.BigInteger, nullable=False), + sa.Column("user_id", sa.BigInteger, nullable=False), + sa.Column("channel_id", sa.BigInteger, nullable=False), + sa.Column("message_count", sa.Integer, nullable=False), + sa.Column("date", sa.Date, nullable=False), + sa.ForeignKeyConstraint(["guild_id"], ["guilds.id"], ondelete="CASCADE"), + ) + + op.create_table( + "user_activity", + sa.Column("id", sa.Integer, primary_key=True, autoincrement=True), + sa.Column("guild_id", sa.BigInteger, nullable=False), + sa.Column("user_id", sa.BigInteger, nullable=False), + sa.Column("last_seen", sa.DateTime, nullable=False), + sa.Column("message_count", sa.Integer, nullable=False, default=0), + sa.ForeignKeyConstraint(["guild_id"], ["guilds.id"], ondelete="CASCADE"), + ) diff --git a/src/guardden/models/guild.py b/src/guardden/models/guild.py index ec42bb3..f5fb72a 100644 --- a/src/guardden/models/guild.py +++ b/src/guardden/models/guild.py @@ -1,17 +1,10 @@ """Guild-related database models.""" -from datetime import datetime -from typing import TYPE_CHECKING - -from sqlalchemy import JSON, Boolean, Float, ForeignKey, Integer, String, Text -from sqlalchemy.dialects.postgresql import JSONB +from sqlalchemy import Boolean, ForeignKey, Integer, String from sqlalchemy.orm import Mapped, mapped_column, relationship from guardden.models.base import Base, SnowflakeID, TimestampMixin -if 
TYPE_CHECKING: - from guardden.models.moderation import ModerationLog, Strike - class Guild(Base, TimestampMixin): """Represents a Discord guild (server) configuration.""" @@ -27,15 +20,6 @@ class Guild(Base, TimestampMixin): settings: Mapped["GuildSettings"] = relationship( back_populates="guild", uselist=False, cascade="all, delete-orphan" ) - banned_words: Mapped[list["BannedWord"]] = relationship( - back_populates="guild", cascade="all, delete-orphan" - ) - moderation_logs: Mapped[list["ModerationLog"]] = relationship( - back_populates="guild", cascade="all, delete-orphan" - ) - strikes: Mapped[list["Strike"]] = relationship( - back_populates="guild", cascade="all, delete-orphan" - ) class GuildSettings(Base, TimestampMixin): @@ -51,94 +35,21 @@ class GuildSettings(Base, TimestampMixin): prefix: Mapped[str] = mapped_column(String(10), default="!", nullable=False) locale: Mapped[str] = mapped_column(String(10), default="en", nullable=False) - # Channel configuration (stored as snowflake IDs) - log_channel_id: Mapped[int | None] = mapped_column(SnowflakeID, nullable=True) - mod_log_channel_id: Mapped[int | None] = mapped_column(SnowflakeID, nullable=True) - welcome_channel_id: Mapped[int | None] = mapped_column(SnowflakeID, nullable=True) - - # Role configuration - mute_role_id: Mapped[int | None] = mapped_column(SnowflakeID, nullable=True) - verified_role_id: Mapped[int | None] = mapped_column(SnowflakeID, nullable=True) - mod_role_ids: Mapped[dict] = mapped_column( - JSONB().with_variant(JSON(), "sqlite"), default=list, nullable=False - ) - - # Moderation settings + # Spam detection settings automod_enabled: Mapped[bool] = mapped_column(Boolean, default=True, nullable=False) anti_spam_enabled: Mapped[bool] = mapped_column(Boolean, default=True, nullable=False) - link_filter_enabled: Mapped[bool] = mapped_column(Boolean, default=False, nullable=False) - - # Automod thresholds message_rate_limit: Mapped[int] = mapped_column(Integer, default=5, nullable=False) 
message_rate_window: Mapped[int] = mapped_column(Integer, default=5, nullable=False) duplicate_threshold: Mapped[int] = mapped_column(Integer, default=3, nullable=False) mention_limit: Mapped[int] = mapped_column(Integer, default=5, nullable=False) mention_rate_limit: Mapped[int] = mapped_column(Integer, default=10, nullable=False) mention_rate_window: Mapped[int] = mapped_column(Integer, default=60, nullable=False) - scam_allowlist: Mapped[list[str]] = mapped_column( - JSONB().with_variant(JSON(), "sqlite"), default=list, nullable=False - ) - - # Strike thresholds (actions at each threshold) - strike_actions: Mapped[dict] = mapped_column( - JSONB().with_variant(JSON(), "sqlite"), - default=lambda: { - "1": {"action": "warn"}, - "3": {"action": "timeout", "duration": 300}, - "5": {"action": "kick"}, - "7": {"action": "ban"}, - }, - nullable=False, - ) # AI moderation settings ai_moderation_enabled: Mapped[bool] = mapped_column(Boolean, default=True, nullable=False) - ai_sensitivity: Mapped[int] = mapped_column(Integer, default=80, nullable=False) # 0-100 scale - ai_confidence_threshold: Mapped[float] = mapped_column(Float, default=0.7, nullable=False) - ai_log_only: Mapped[bool] = mapped_column(Boolean, default=False, nullable=False) + ai_sensitivity: Mapped[int] = mapped_column(Integer, default=80, nullable=False) nsfw_detection_enabled: Mapped[bool] = mapped_column(Boolean, default=True, nullable=False) nsfw_only_filtering: Mapped[bool] = mapped_column(Boolean, default=True, nullable=False) - # Notification settings - send_in_channel_warnings: Mapped[bool] = mapped_column(Boolean, default=False, nullable=False) - - # Whitelist settings - whitelisted_user_ids: Mapped[list[int]] = mapped_column( - JSONB().with_variant(JSON(), "sqlite"), default=list, nullable=False - ) - - # Verification settings - verification_enabled: Mapped[bool] = mapped_column(Boolean, default=False, nullable=False) - verification_type: Mapped[str] = mapped_column( - String(20), 
default="button", nullable=False - ) # button, captcha, questions - # Relationship guild: Mapped["Guild"] = relationship(back_populates="settings") - - -class BannedWord(Base, TimestampMixin): - """Banned words/phrases for a guild with regex support.""" - - __tablename__ = "banned_words" - - id: Mapped[int] = mapped_column(Integer, primary_key=True, autoincrement=True) - guild_id: Mapped[int] = mapped_column( - SnowflakeID, ForeignKey("guilds.id", ondelete="CASCADE"), nullable=False - ) - - pattern: Mapped[str] = mapped_column(Text, nullable=False) - is_regex: Mapped[bool] = mapped_column(Boolean, default=False, nullable=False) - action: Mapped[str] = mapped_column( - String(20), default="delete", nullable=False - ) # delete, warn, strike - reason: Mapped[str | None] = mapped_column(Text, nullable=True) - source: Mapped[str | None] = mapped_column(String(100), nullable=True) - category: Mapped[str | None] = mapped_column(String(20), nullable=True) - managed: Mapped[bool] = mapped_column(Boolean, default=False, nullable=False) - - # Who added this and when - added_by: Mapped[int] = mapped_column(SnowflakeID, nullable=False) - - # Relationship - guild: Mapped["Guild"] = relationship(back_populates="banned_words") diff --git a/src/guardden/services/ai/anthropic_provider.py b/src/guardden/services/ai/anthropic_provider.py index e729bf2..72e1f22 100644 --- a/src/guardden/services/ai/anthropic_provider.py +++ b/src/guardden/services/ai/anthropic_provider.py @@ -3,41 +3,10 @@ import logging from typing import Any -from guardden.services.ai.base import ( - AIProvider, - ImageAnalysisResult, - ModerationResult, - PhishingAnalysisResult, - parse_categories, - run_with_retries, -) +from guardden.services.ai.base import AIProvider, ImageAnalysisResult, run_with_retries logger = logging.getLogger(__name__) -# Content moderation system prompt -MODERATION_SYSTEM_PROMPT = """You are a content moderation AI for a Discord server. 
Analyze the given message and determine if it violates community guidelines. - -Categories to check: -- harassment: Personal attacks, bullying, intimidation -- hate_speech: Discrimination, slurs, dehumanization based on identity -- sexual: Explicit sexual content, sexual solicitation -- violence: Threats, graphic violence, encouraging harm -- self_harm: Suicide, self-injury content or encouragement -- spam: Repetitive, promotional, or low-quality content -- scam: Phishing attempts, fraudulent offers, impersonation -- misinformation: Dangerous false information - -Respond in this exact JSON format: -{ - "is_flagged": true/false, - "confidence": 0.0-1.0, - "categories": ["category1", "category2"], - "explanation": "Brief explanation", - "suggested_action": "none/warn/delete/timeout/ban" -} - -Be balanced - flag genuinely problematic content but allow normal conversation, jokes, and mild language. Consider context.""" - IMAGE_ANALYSIS_PROMPT = """Analyze this image for content moderation. Classify NSFW content with precision. NSFW CATEGORIES (in order of severity): @@ -76,24 +45,6 @@ NSFW SEVERITY GUIDELINES: - nudity: 60-75 - explicit: 80-100""" -PHISHING_ANALYSIS_PROMPT = """Analyze this URL and message context for phishing or scam indicators. 
- -Check for: -- Domain impersonation (typosquatting, lookalike domains) -- Urgency tactics ("act now", "limited time") -- Requests for credentials or personal info -- Too-good-to-be-true offers -- Suspicious redirects or URL shorteners -- Mismatched or hidden URLs - -Respond in this exact JSON format: -{ - "is_phishing": true/false, - "confidence": 0.0-1.0, - "risk_factors": ["factor1", "factor2"], - "explanation": "Brief explanation" -}""" - class AnthropicProvider(AIProvider): """AI provider using Anthropic's Claude API.""" @@ -150,47 +101,6 @@ class AnthropicProvider(AIProvider): return json.loads(text) - async def moderate_text( - self, - content: str, - context: str | None = None, - sensitivity: int = 50, - ) -> ModerationResult: - """Analyze text content for policy violations.""" - # Adjust prompt based on sensitivity - sensitivity_note = "" - if sensitivity < 30: - sensitivity_note = "\n\nBe lenient - only flag clearly problematic content." - elif sensitivity > 70: - sensitivity_note = "\n\nBe strict - flag anything potentially problematic." 
- - system = MODERATION_SYSTEM_PROMPT + sensitivity_note - - user_message = f"Message to analyze:\n{content}" - if context: - user_message = f"Context: {context}\n\n{user_message}" - - try: - response = await self._call_api(system, user_message) - data = self._parse_json_response(response) - - categories = parse_categories(data.get("categories", [])) - - return ModerationResult( - is_flagged=data.get("is_flagged", False), - confidence=float(data.get("confidence", 0.0)), - categories=categories, - explanation=data.get("explanation", ""), - suggested_action=data.get("suggested_action", "none"), - ) - - except Exception as e: - logger.error(f"Error moderating text: {e}") - return ModerationResult( - is_flagged=False, - explanation=f"Error analyzing content: {str(e)}", - ) - async def analyze_image( self, image_url: str, @@ -276,31 +186,6 @@ SENSITIVITY: BALANCED logger.error(f"Error analyzing image: {e}") return ImageAnalysisResult(description=f"Error analyzing image: {str(e)}") - async def analyze_phishing( - self, - url: str, - message_content: str | None = None, - ) -> PhishingAnalysisResult: - """Analyze a URL for phishing/scam indicators.""" - user_message = f"URL to analyze: {url}" - if message_content: - user_message += f"\n\nFull message context:\n{message_content}" - - try: - response = await self._call_api(PHISHING_ANALYSIS_PROMPT, user_message) - data = self._parse_json_response(response) - - return PhishingAnalysisResult( - is_phishing=data.get("is_phishing", False), - confidence=float(data.get("confidence", 0.0)), - risk_factors=data.get("risk_factors", []), - explanation=data.get("explanation", ""), - ) - - except Exception as e: - logger.error(f"Error analyzing phishing: {e}") - return PhishingAnalysisResult(explanation=f"Error analyzing URL: {str(e)}") - async def close(self) -> None: """Clean up resources.""" await self.client.close() diff --git a/src/guardden/services/ai/base.py b/src/guardden/services/ai/base.py index 29bd789..b9bcf3f 100644 --- 
a/src/guardden/services/ai/base.py +++ b/src/guardden/services/ai/base.py @@ -91,53 +91,6 @@ async def run_with_retries( raise RuntimeError("Retry loop exited unexpectedly") -@dataclass -class ModerationResult: - """Result of AI content moderation.""" - - is_flagged: bool = False - confidence: float = 0.0 # 0.0 to 1.0 - categories: list[ContentCategory] = field(default_factory=list) - explanation: str = "" - suggested_action: Literal["none", "warn", "delete", "timeout", "ban"] = "none" - severity_override: int | None = None # Direct severity for NSFW images - - @property - def severity(self) -> int: - """Get severity score 0-100 based on confidence and categories.""" - if not self.is_flagged: - return 0 - - # Use override if provided (e.g., from NSFW image analysis) - if self.severity_override is not None: - return min(self.severity_override, 100) - - # Base severity from confidence - severity = int(self.confidence * 50) - - # Add severity based on category - high_severity = { - ContentCategory.HATE_SPEECH, - ContentCategory.SELF_HARM, - ContentCategory.SCAM, - } - medium_severity = { - ContentCategory.HARASSMENT, - ContentCategory.VIOLENCE, - ContentCategory.SEXUAL, - } - - for cat in self.categories: - if cat in high_severity: - severity += 30 - elif cat in medium_severity: - severity += 20 - else: - severity += 10 - - return min(severity, 100) - - @dataclass class ImageAnalysisResult: """Result of AI image analysis.""" @@ -152,38 +105,8 @@ class ImageAnalysisResult: nsfw_severity: int = 0 # 0-100 specific NSFW severity score -@dataclass -class PhishingAnalysisResult: - """Result of AI phishing/scam analysis.""" - - is_phishing: bool = False - confidence: float = 0.0 - risk_factors: list[str] = field(default_factory=list) - explanation: str = "" - - class AIProvider(ABC): - """Abstract base class for AI providers.""" - - @abstractmethod - async def moderate_text( - self, - content: str, - context: str | None = None, - sensitivity: int = 50, - ) -> 
ModerationResult: - """ - Analyze text content for policy violations. - - Args: - content: The text to analyze - context: Optional context about the conversation/server - sensitivity: 0-100, higher means more strict - - Returns: - ModerationResult with analysis - """ - pass + """Abstract base class for AI providers - Image analysis only.""" @abstractmethod async def analyze_image( @@ -203,24 +126,6 @@ class AIProvider(ABC): """ pass - @abstractmethod - async def analyze_phishing( - self, - url: str, - message_content: str | None = None, - ) -> PhishingAnalysisResult: - """ - Analyze a URL for phishing/scam indicators. - - Args: - url: The URL to analyze - message_content: Optional full message for context - - Returns: - PhishingAnalysisResult with analysis - """ - pass - @abstractmethod async def close(self) -> None: """Clean up resources.""" diff --git a/src/guardden/services/ai/openai_provider.py b/src/guardden/services/ai/openai_provider.py index 103a2b1..7cbfe91 100644 --- a/src/guardden/services/ai/openai_provider.py +++ b/src/guardden/services/ai/openai_provider.py @@ -3,14 +3,7 @@ import logging from typing import Any -from guardden.services.ai.base import ( - AIProvider, - ContentCategory, - ImageAnalysisResult, - ModerationResult, - PhishingAnalysisResult, - run_with_retries, -) +from guardden.services.ai.base import AIProvider, ImageAnalysisResult, run_with_retries logger = logging.getLogger(__name__) @@ -35,107 +28,12 @@ class OpenAIProvider(AIProvider): self.model = model logger.info(f"Initialized OpenAI provider with model: {model}") - async def _call_api( - self, - system: str, - user_content: Any, - max_tokens: int = 500, - ) -> str: - """Make an API call to OpenAI.""" - - async def _request() -> str: - response = await self.client.chat.completions.create( - model=self.model, - max_tokens=max_tokens, - messages=[ - {"role": "system", "content": system}, - {"role": "user", "content": user_content}, - ], - response_format={"type": "json_object"}, - ) - 
return response.choices[0].message.content or "" - - try: - return await run_with_retries( - _request, - logger=logger, - operation_name="OpenAI chat completion", - ) - except Exception as e: - logger.error(f"OpenAI API error: {e}") - raise - def _parse_json_response(self, response: str) -> dict: """Parse JSON from response.""" import json return json.loads(response) - async def moderate_text( - self, - content: str, - context: str | None = None, - sensitivity: int = 50, - ) -> ModerationResult: - """Analyze text content for policy violations.""" - # First, use OpenAI's built-in moderation API for quick check - try: - - async def _moderate() -> Any: - return await self.client.moderations.create(input=content) - - mod_response = await run_with_retries( - _moderate, - logger=logger, - operation_name="OpenAI moderation", - ) - results = mod_response.results[0] - - # Map OpenAI categories to our categories - category_mapping = { - "harassment": ContentCategory.HARASSMENT, - "harassment/threatening": ContentCategory.HARASSMENT, - "hate": ContentCategory.HATE_SPEECH, - "hate/threatening": ContentCategory.HATE_SPEECH, - "self-harm": ContentCategory.SELF_HARM, - "self-harm/intent": ContentCategory.SELF_HARM, - "self-harm/instructions": ContentCategory.SELF_HARM, - "sexual": ContentCategory.SEXUAL, - "sexual/minors": ContentCategory.SEXUAL, - "violence": ContentCategory.VIOLENCE, - "violence/graphic": ContentCategory.VIOLENCE, - } - - flagged_categories = [] - max_score = 0.0 - - for category, score in results.category_scores.model_dump().items(): - if score > 0.5: # Threshold - if category in category_mapping: - flagged_categories.append(category_mapping[category]) - max_score = max(max_score, score) - - # Adjust threshold based on sensitivity - threshold = 0.3 + (0.4 * (100 - sensitivity) / 100) # 0.3 to 0.7 - - if results.flagged or max_score > threshold: - return ModerationResult( - is_flagged=True, - confidence=max_score, - categories=list(set(flagged_categories)), - 
explanation="Content flagged by moderation API", - suggested_action="delete" if max_score > 0.8 else "warn", - ) - - return ModerationResult(is_flagged=False, confidence=1.0 - max_score) - - except Exception as e: - logger.error(f"Error moderating text: {e}") - return ModerationResult( - is_flagged=False, - explanation=f"Error analyzing content: {str(e)}", - ) - async def analyze_image( self, image_url: str, @@ -223,41 +121,6 @@ NSFW SEVERITY GUIDELINES: none=0, suggestive=20-35, partial_nudity=40-55, nudity logger.error(f"Error analyzing image: {e}") return ImageAnalysisResult(description=f"Error analyzing image: {str(e)}") - async def analyze_phishing( - self, - url: str, - message_content: str | None = None, - ) -> PhishingAnalysisResult: - """Analyze a URL for phishing/scam indicators.""" - system = """Analyze the URL for phishing/scam indicators. Respond in JSON: -{ - "is_phishing": true/false, - "confidence": 0.0-1.0, - "risk_factors": ["factor1"], - "explanation": "Brief explanation" -} - -Check for: domain impersonation, urgency tactics, credential requests, too-good-to-be-true offers.""" - - user_message = f"URL: {url}" - if message_content: - user_message += f"\n\nMessage context: {message_content}" - - try: - response = await self._call_api(system, user_message) - data = self._parse_json_response(response) - - return PhishingAnalysisResult( - is_phishing=data.get("is_phishing", False), - confidence=float(data.get("confidence", 0.0)), - risk_factors=data.get("risk_factors", []), - explanation=data.get("explanation", ""), - ) - - except Exception as e: - logger.error(f"Error analyzing phishing: {e}") - return PhishingAnalysisResult(explanation=f"Error analyzing URL: {str(e)}") - async def close(self) -> None: """Clean up resources.""" await self.client.close()