Add image and GIF vision support

- Add ImageAttachment dataclass for image metadata
- Update Message to support a list of image attachments
- Update all providers (OpenAI, Anthropic, Gemini, OpenRouter) for vision
- Extract images from Discord attachments and embeds in ai_chat.py
- Support PNG, JPEG, GIF, and WebP formats
2026-01-11 20:56:50 +01:00
parent 8f521b869b
commit 4ac123be9c
8 changed files with 187 additions and 12 deletions
--- a/src/daemon_boyfriend/cogs/ai_chat.py
+++ b/src/daemon_boyfriend/cogs/ai_chat.py
@@ -7,7 +7,13 @@ import discord
 from discord.ext import commands

 from daemon_boyfriend.config import settings
-from daemon_boyfriend.services import AIService, ConversationManager, Message, SearXNGService
+from daemon_boyfriend.services import (
+    AIService,
+    ConversationManager,
+    ImageAttachment,
+    Message,
+    SearXNGService,
+)

 logger = logging.getLogger(__name__)

@@ -125,6 +131,65 @@ class AIChatCog(commands.Cog):

        return content.strip()

+    def _extract_image_attachments(self, message: discord.Message) -> list[ImageAttachment]:
+        """Extract image attachments from a Discord message.
+
+        Args:
+            message: The Discord message
+
+        Returns:
+            List of ImageAttachment objects
+        """
+        images = []
+
+        # Supported image types
+        image_types = {
+            "image/png": "image/png",
+            "image/jpeg": "image/jpeg",
+            "image/jpg": "image/jpeg",
+            "image/gif": "image/gif",
+            "image/webp": "image/webp",
+        }
+
+        # Check message attachments
+        for attachment in message.attachments:
+            content_type = attachment.content_type or ""
+            if content_type in image_types:
+                images.append(
+                    ImageAttachment(
+                        url=attachment.url,
+                        media_type=image_types[content_type],
+                    )
+                )
+            # Also check by file extension if content_type not set
+            elif attachment.filename:
+                ext = attachment.filename.lower().split(".")[-1]
+                if ext in ("png", "jpg", "jpeg", "gif", "webp"):
+                    media_type = f"image/{ext}" if ext != "jpg" else "image/jpeg"
+                    images.append(
+                        ImageAttachment(
+                            url=attachment.url,
+                            media_type=media_type,
+                        )
+                    )
+
+        # Check embeds for images
+        for embed in message.embeds:
+            if embed.image and embed.image.url:
+                # Guess media type from URL
+                url = embed.image.url.lower()
+                media_type = "image/png"  # default
+                if ".jpg" in url or ".jpeg" in url:
+                    media_type = "image/jpeg"
+                elif ".gif" in url:
+                    media_type = "image/gif"
+                elif ".webp" in url:
+                    media_type = "image/webp"
+                images.append(ImageAttachment(url=embed.image.url, media_type=media_type))
+
+        logger.debug(f"Extracted {len(images)} images from message")
+        return images
+
    def _get_mentioned_users_context(self, message: discord.Message) -> str | None:
        """Get context about mentioned users (excluding the bot).

@@ -178,8 +243,12 @@ class AIChatCog(commands.Cog):
        # Get conversation history
        history = self.conversations.get_history(user_id)

-        # Add current message to history for the API call
-        messages = history + [Message(role="user", content=user_message)]
+        # Extract any image attachments from the message
+        images = self._extract_image_attachments(message)
+
+        # Add current message to history for the API call (with images if any)
+        current_message = Message(role="user", content=user_message, images=images)
+        messages = history + [current_message]

        # Check if we should search the web
        search_context = await self._maybe_search(user_message)