From 853e7c9fcdf58c727a01d3b028ec6f2f9f745468 Mon Sep 17 00:00:00 2001 From: latte Date: Sun, 11 Jan 2026 21:04:49 +0100 Subject: [PATCH] Add monitoring, metrics tracking, and enhanced logging - Add BotMonitor class for tracking requests, errors, and health status - Track request success/failure, response times, error rates - Add !status command (admin) for detailed metrics with embed - Add !health command (anyone) for quick health check - Add rotating file logging support (LOG_FILE config) - Auto-detect unhealthy status when errors exceed threshold - Record errors with timestamps and context for debugging --- .env.example | 14 +- src/daemon_boyfriend/cogs/ai_chat.py | 9 + src/daemon_boyfriend/cogs/status.py | 147 ++++++++++++ src/daemon_boyfriend/config.py | 7 + src/daemon_boyfriend/utils/__init__.py | 5 + src/daemon_boyfriend/utils/logging.py | 26 ++- src/daemon_boyfriend/utils/monitoring.py | 282 +++++++++++++++++++++++ 7 files changed, 487 insertions(+), 3 deletions(-) create mode 100644 src/daemon_boyfriend/cogs/status.py create mode 100644 src/daemon_boyfriend/utils/monitoring.py diff --git a/.env.example b/.env.example index 50141b6..8768761 100644 --- a/.env.example +++ b/.env.example @@ -62,7 +62,19 @@ SEARXNG_ENABLED=true SEARXNG_MAX_RESULTS=5 # =========================================== -# Logging +# Logging & Monitoring # =========================================== # Log level: DEBUG, INFO, WARNING, ERROR LOG_LEVEL=INFO + +# Log file path (optional - logs to console only if not set) +# LOG_FILE=logs/bot.log + +# Custom log format (optional) +# LOG_FORMAT=%(asctime)s - %(name)s - %(levelname)s - %(message)s + +# =========================================== +# Monitoring Commands +# =========================================== +# Use !status (admin only) for detailed metrics +# Use !health (anyone) for quick health check diff --git a/src/daemon_boyfriend/cogs/ai_chat.py b/src/daemon_boyfriend/cogs/ai_chat.py index 1be2ddf..63d6b41 100644 --- a/src/daemon_boyfriend/cogs/ai_chat.py +++ b/src/daemon_boyfriend/cogs/ai_chat.py @@ -14,6 +14,7 @@ from daemon_boyfriend.services import ( Message, SearXNGService, ) +from daemon_boyfriend.utils import get_monitor logger = logging.getLogger(__name__) @@ -101,6 +102,9 @@ class AIChatCog(commands.Cog): return # Show typing indicator while generating response + monitor = get_monitor() + start_time = monitor.record_request_start() + async with message.channel.typing(): try: response_text = await self._generate_response(message, content) @@ -132,7 +136,12 @@ class AIChatCog(commands.Cog): for img_url in remaining_images: await message.channel.send(embed=self._create_image_embed(img_url)) + # Record successful request + monitor.record_request_success(start_time) + except Exception as e: + # Record failed request + monitor.record_request_failure(start_time, e, context="on_message") logger.error(f"Mention response error: {e}", exc_info=True) error_message = self._get_error_message(e) await message.reply(error_message) diff --git a/src/daemon_boyfriend/cogs/status.py b/src/daemon_boyfriend/cogs/status.py new file mode 100644 index 0000000..103a300 --- /dev/null +++ b/src/daemon_boyfriend/cogs/status.py @@ -0,0 +1,147 @@ +"""Status cog - provides bot health and metrics commands.""" + +import logging + +import discord +from discord.ext import commands + +from daemon_boyfriend.utils import HealthStatus, get_monitor + +logger = logging.getLogger(__name__) + + +class StatusCog(commands.Cog): + """Bot status and monitoring commands.""" + + def __init__(self, bot: commands.Bot) -> None: + self.bot = bot + + @commands.command(name="status") + @commands.has_permissions(administrator=True) + async def status_command(self, ctx: commands.Context) -> None: + """Show bot status and metrics (admin only).""" + monitor = get_monitor() + metrics = monitor.get_metrics() + + # Choose embed color based on health status + color_map = { + HealthStatus.HEALTHY: discord.Color.green(), + HealthStatus.DEGRADED: discord.Color.yellow(), + HealthStatus.UNHEALTHY: discord.Color.red(), + } + color = color_map.get(metrics.health_status, discord.Color.grey()) + + # Format uptime + uptime_hours = metrics.uptime_seconds / 3600 + if uptime_hours >= 24: + uptime_str = f"{uptime_hours / 24:.1f} days" + elif uptime_hours >= 1: + uptime_str = f"{uptime_hours:.1f} hours" + else: + uptime_str = f"{metrics.uptime_seconds / 60:.1f} minutes" + + # Create embed + embed = discord.Embed( + title="Bot Status Report", + color=color, + ) + + # Status field with emoji + status_emoji = { + HealthStatus.HEALTHY: "🟢", + HealthStatus.DEGRADED: "🟡", + HealthStatus.UNHEALTHY: "🔴", + } + embed.add_field( + name="Status", + value=f"{status_emoji.get(metrics.health_status, '⚪')} {metrics.health_status.value.upper()}", + inline=True, + ) + + embed.add_field(name="Uptime", value=uptime_str, inline=True) + embed.add_field(name="Guilds", value=str(len(self.bot.guilds)), inline=True) + + # Request metrics + embed.add_field( + name="Requests", + value=f"Total: {metrics.total_requests}\n" + f"Success: {metrics.successful_requests}\n" + f"Failed: {metrics.failed_requests}", + inline=True, + ) + + embed.add_field( + name="Performance", + value=f"Avg response: {metrics.avg_response_time_ms:.0f}ms\n" + f"Req/min: {metrics.requests_per_minute:.1f}\n" + f"Error rate: {metrics.error_rate_percent:.1f}%", + inline=True, + ) + + # Last activity + last_success = "Never" + if metrics.last_successful_request: + last_success = metrics.last_successful_request.strftime("%Y-%m-%d %H:%M:%S UTC") + + last_error = "None" + if metrics.last_error: + last_error = metrics.last_error.strftime("%Y-%m-%d %H:%M:%S UTC") + + embed.add_field( + name="Last Activity", + value=f"Success: {last_success}\nError: {last_error}", + inline=False, + ) + + # Recent errors (if any) + if metrics.recent_errors: + error_lines = [] + for error in metrics.recent_errors[-3:]: # Last 3 errors + time_str = error.timestamp.strftime("%H:%M:%S") + error_lines.append(f"`{time_str}` **{error.error_type}**: {error.message[:50]}...") + embed.add_field( + name="Recent Errors", + value="\n".join(error_lines), + inline=False, + ) + + await ctx.send(embed=embed) + + @commands.command(name="health") + async def health_command(self, ctx: commands.Context) -> None: + """Quick health check (anyone can use).""" + monitor = get_monitor() + metrics = monitor.get_metrics() + + status_emoji = { + HealthStatus.HEALTHY: "🟢", + HealthStatus.DEGRADED: "🟡", + HealthStatus.UNHEALTHY: "🔴", + } + + emoji = status_emoji.get(metrics.health_status, "⚪") + status = metrics.health_status.value + + # Format uptime + uptime_hours = metrics.uptime_seconds / 3600 + if uptime_hours >= 1: + uptime_str = f"{uptime_hours:.1f}h" + else: + uptime_str = f"{metrics.uptime_seconds / 60:.0f}m" + + await ctx.send( + f"{emoji} **{status.upper()}** | Uptime: {uptime_str} | Requests: {metrics.total_requests}" + ) + + @status_command.error + async def status_error(self, ctx: commands.Context, error: commands.CommandError) -> None: + """Handle status command errors.""" + if isinstance(error, commands.MissingPermissions): + await ctx.send("You need administrator permissions to view detailed status.") + else: + logger.error(f"Status command error: {error}") + + +async def setup(bot: commands.Bot) -> None: + """Load the Status cog.""" + await bot.add_cog(StatusCog(bot)) diff --git a/src/daemon_boyfriend/config.py b/src/daemon_boyfriend/config.py index 580ee29..20a2a99 100644 --- a/src/daemon_boyfriend/config.py +++ b/src/daemon_boyfriend/config.py @@ -34,6 +34,13 @@ class Settings(BaseSettings): # Logging log_level: str = Field("INFO", description="Logging level") + log_file: str | None = Field( + None, description="Log file path (optional, logs to console if not set)" + ) + log_format: str = Field( + "%(asctime)s - %(name)s - %(levelname)s - %(message)s", + description="Log message format", + ) # Bot Identity bot_name: str = Field("AI Bot", description="Bot display name") diff --git a/src/daemon_boyfriend/utils/__init__.py b/src/daemon_boyfriend/utils/__init__.py index 2cd7c5b..af55074 100644 --- a/src/daemon_boyfriend/utils/__init__.py +++ b/src/daemon_boyfriend/utils/__init__.py @@ -1,8 +1,13 @@ """Utility modules.""" from .logging import get_logger, setup_logging +from .monitoring import BotMonitor, HealthStatus, MetricsSnapshot, get_monitor __all__ = [ "setup_logging", "get_logger", + "BotMonitor", + "HealthStatus", + "MetricsSnapshot", + "get_monitor", ] diff --git a/src/daemon_boyfriend/utils/logging.py b/src/daemon_boyfriend/utils/logging.py index 0c222df..e976ab3 100644 --- a/src/daemon_boyfriend/utils/logging.py +++ b/src/daemon_boyfriend/utils/logging.py @@ -2,6 +2,8 @@ import logging import sys +from logging.handlers import RotatingFileHandler +from pathlib import Path from daemon_boyfriend.config import settings @@ -10,7 +12,7 @@ def setup_logging() -> None: """Configure application-wide logging.""" # Create formatter formatter = logging.Formatter( - "%(asctime)s - %(name)s - %(levelname)s - %(message)s", + settings.log_format, datefmt="%Y-%m-%d %H:%M:%S", ) @@ -21,11 +23,29 @@ def setup_logging() -> None: # Remove existing handlers root_logger.handlers.clear() - # Console handler + # Console handler (always enabled) console_handler = logging.StreamHandler(sys.stdout) console_handler.setFormatter(formatter) root_logger.addHandler(console_handler) + # File handler (if configured) + if settings.log_file: + log_path = Path(settings.log_file) + # Create directory if it doesn't exist + log_path.parent.mkdir(parents=True, exist_ok=True) + + # Use rotating file handler (10MB max, keep 5 backups) + file_handler = RotatingFileHandler( + log_path, + maxBytes=10 * 1024 * 1024, # 10MB + backupCount=5, + encoding="utf-8", + ) + file_handler.setFormatter(formatter) + root_logger.addHandler(file_handler) + + logging.info(f"Logging to file: {log_path}") + # Reduce noise from discord.py logging.getLogger("discord").setLevel(logging.WARNING) logging.getLogger("discord.http").setLevel(logging.WARNING) @@ -38,6 +58,8 @@ def setup_logging() -> None: logging.getLogger("httpx").setLevel(logging.WARNING) logging.getLogger("httpcore").setLevel(logging.WARNING) + logging.info(f"Logging initialized at {settings.log_level} level") + def get_logger(name: str) -> logging.Logger: """Get a logger with the given name.""" diff --git a/src/daemon_boyfriend/utils/monitoring.py b/src/daemon_boyfriend/utils/monitoring.py new file mode 100644 index 0000000..6b09c9f --- /dev/null +++ b/src/daemon_boyfriend/utils/monitoring.py @@ -0,0 +1,282 @@ +"""Monitoring and metrics tracking for the bot.""" + +import asyncio +import logging +import time +from collections import deque +from dataclasses import dataclass, field +from datetime import datetime, timezone +from enum import Enum +from typing import Callable + +logger = logging.getLogger(__name__) + + +class HealthStatus(Enum): + """Health status levels.""" + + HEALTHY = "healthy" + DEGRADED = "degraded" + UNHEALTHY = "unhealthy" + + +@dataclass +class ErrorRecord: + """Record of an error occurrence.""" + + timestamp: datetime + error_type: str + message: str + context: str = "" + + +@dataclass +class MetricsSnapshot: + """Snapshot of current metrics.""" + + uptime_seconds: float + total_requests: int + successful_requests: int + failed_requests: int + avg_response_time_ms: float + requests_per_minute: float + error_rate_percent: float + recent_errors: list[ErrorRecord] + health_status: HealthStatus + last_successful_request: datetime | None + last_error: datetime | None + + +class BotMonitor: + """Monitors bot health and collects metrics.""" + + def __init__( + self, + error_threshold: int = 10, + error_window_minutes: int = 5, + max_error_history: int = 50, + ) -> None: + """Initialize the monitor. + + Args: + error_threshold: Number of errors in window to trigger unhealthy status + error_window_minutes: Time window for error rate calculation + max_error_history: Maximum number of errors to keep in history + """ + self._start_time = time.time() + self._total_requests = 0 + self._successful_requests = 0 + self._failed_requests = 0 + self._response_times: deque[float] = deque(maxlen=100) + self._request_timestamps: deque[float] = deque(maxlen=1000) + self._errors: deque[ErrorRecord] = deque(maxlen=max_error_history) + self._error_threshold = error_threshold + self._error_window_minutes = error_window_minutes + self._last_successful_request: datetime | None = None + self._last_error: datetime | None = None + self._status_callbacks: list[Callable[[HealthStatus], None]] = [] + + def record_request_start(self) -> float: + """Record the start of a request. + + Returns: + Start timestamp for use with record_request_end + """ + self._total_requests += 1 + start_time = time.time() + self._request_timestamps.append(start_time) + return start_time + + def record_request_success(self, start_time: float) -> None: + """Record a successful request. + + Args: + start_time: Timestamp from record_request_start + """ + self._successful_requests += 1 + response_time = (time.time() - start_time) * 1000 # Convert to ms + self._response_times.append(response_time) + self._last_successful_request = datetime.now(timezone.utc) + + logger.debug(f"Request completed in {response_time:.2f}ms") + + def record_request_failure( + self, start_time: float, error: Exception, context: str = "" + ) -> None: + """Record a failed request. + + Args: + start_time: Timestamp from record_request_start + error: The exception that occurred + context: Additional context about where the error occurred + """ + self._failed_requests += 1 + response_time = (time.time() - start_time) * 1000 + self._response_times.append(response_time) + + error_record = ErrorRecord( + timestamp=datetime.now(timezone.utc), + error_type=type(error).__name__, + message=str(error)[:500], # Truncate long messages + context=context, + ) + self._errors.append(error_record) + self._last_error = error_record.timestamp + + logger.warning( + f"Request failed after {response_time:.2f}ms: " + f"[{error_record.error_type}] {error_record.message[:100]}" + ) + + # Check if we need to alert about health status change + new_status = self._calculate_health_status() + if new_status == HealthStatus.UNHEALTHY: + self._notify_status_change(new_status) + + def record_error(self, error: Exception, context: str = "") -> None: + """Record an error without request timing. + + Args: + error: The exception that occurred + context: Additional context about where the error occurred + """ + error_record = ErrorRecord( + timestamp=datetime.now(timezone.utc), + error_type=type(error).__name__, + message=str(error)[:500], + context=context, + ) + self._errors.append(error_record) + self._last_error = error_record.timestamp + + logger.error(f"Error recorded: [{error_record.error_type}] {error_record.message}") + + def on_status_change(self, callback: Callable[[HealthStatus], None]) -> None: + """Register a callback for health status changes. + + Args: + callback: Function to call when status changes to unhealthy + """ + self._status_callbacks.append(callback) + + def _notify_status_change(self, status: HealthStatus) -> None: + """Notify all registered callbacks of a status change.""" + for callback in self._status_callbacks: + try: + callback(status) + except Exception as e: + logger.error(f"Status callback failed: {e}") + + def _calculate_health_status(self) -> HealthStatus: + """Calculate the current health status.""" + now = time.time() + window_start = now - (self._error_window_minutes * 60) + + # Count recent errors + recent_error_count = sum(1 for e in self._errors if e.timestamp.timestamp() > window_start) + + if recent_error_count >= self._error_threshold: + return HealthStatus.UNHEALTHY + elif recent_error_count >= self._error_threshold // 2: + return HealthStatus.DEGRADED + else: + return HealthStatus.HEALTHY + + def _calculate_requests_per_minute(self) -> float: + """Calculate the current requests per minute rate.""" + now = time.time() + one_minute_ago = now - 60 + + recent_requests = sum(1 for t in self._request_timestamps if t > one_minute_ago) + return float(recent_requests) + + def get_metrics(self) -> MetricsSnapshot: + """Get a snapshot of current metrics. + + Returns: + MetricsSnapshot with all current metrics + """ + uptime = time.time() - self._start_time + + avg_response_time = 0.0 + if self._response_times: + avg_response_time = sum(self._response_times) / len(self._response_times) + + error_rate = 0.0 + if self._total_requests > 0: + error_rate = (self._failed_requests / self._total_requests) * 100 + + return MetricsSnapshot( + uptime_seconds=uptime, + total_requests=self._total_requests, + successful_requests=self._successful_requests, + failed_requests=self._failed_requests, + avg_response_time_ms=avg_response_time, + requests_per_minute=self._calculate_requests_per_minute(), + error_rate_percent=error_rate, + recent_errors=list(self._errors)[-10:], # Last 10 errors + health_status=self._calculate_health_status(), + last_successful_request=self._last_successful_request, + last_error=self._last_error, + ) + + def get_status_report(self) -> str: + """Get a formatted status report. + + Returns: + Human-readable status report + """ + metrics = self.get_metrics() + + # Format uptime + uptime_hours = metrics.uptime_seconds / 3600 + uptime_str = ( + f"{uptime_hours:.1f} hours" + if uptime_hours >= 1 + else f"{metrics.uptime_seconds / 60:.1f} minutes" + ) + + report = [ + "=== Bot Status Report ===", + f"Status: {metrics.health_status.value.upper()}", + f"Uptime: {uptime_str}", + "", + "--- Request Metrics ---", + f"Total requests: {metrics.total_requests}", + f"Successful: {metrics.successful_requests}", + f"Failed: {metrics.failed_requests}", + f"Error rate: {metrics.error_rate_percent:.1f}%", + f"Avg response time: {metrics.avg_response_time_ms:.0f}ms", + f"Requests/min: {metrics.requests_per_minute:.1f}", + ] + + if metrics.last_successful_request: + report.append(f"Last success: {metrics.last_successful_request.isoformat()}") + + if metrics.recent_errors: + report.append("") + report.append("--- Recent Errors ---") + for error in metrics.recent_errors[-5:]: + report.append( + f"[{error.timestamp.strftime('%H:%M:%S')}] {error.error_type}: {error.message[:50]}" + ) + + return "\n".join(report) + + +# Global monitor instance +_monitor: BotMonitor | None = None + + +def get_monitor() -> BotMonitor: + """Get the global monitor instance.""" + global _monitor + if _monitor is None: + _monitor = BotMonitor() + return _monitor + + +def reset_monitor() -> None: + """Reset the global monitor (mainly for testing).""" + global _monitor + _monitor = None