Add monitoring, metrics tracking, and enhanced logging
- Add BotMonitor class for tracking requests, errors, and health status - Track request success/failure, response times, error rates - Add !status command (admin) for detailed metrics with embed - Add !health command (anyone) for quick health check - Add rotating file logging support (LOG_FILE config) - Auto-detect unhealthy status when errors exceed threshold - Record errors with timestamps and context for debugging
This commit is contained in:
14
.env.example
14
.env.example
@@ -62,7 +62,19 @@ SEARXNG_ENABLED=true
|
|||||||
SEARXNG_MAX_RESULTS=5
|
SEARXNG_MAX_RESULTS=5
|
||||||
|
|
||||||
# ===========================================
|
# ===========================================
|
||||||
# Logging
|
# Logging & Monitoring
|
||||||
# ===========================================
|
# ===========================================
|
||||||
# Log level: DEBUG, INFO, WARNING, ERROR
|
# Log level: DEBUG, INFO, WARNING, ERROR
|
||||||
LOG_LEVEL=INFO
|
LOG_LEVEL=INFO
|
||||||
|
|
||||||
|
# Log file path (optional - logs to console only if not set)
|
||||||
|
# LOG_FILE=logs/bot.log
|
||||||
|
|
||||||
|
# Custom log format (optional)
|
||||||
|
# LOG_FORMAT=%(asctime)s - %(name)s - %(levelname)s - %(message)s
|
||||||
|
|
||||||
|
# ===========================================
|
||||||
|
# Monitoring Commands
|
||||||
|
# ===========================================
|
||||||
|
# Use !status (admin only) for detailed metrics
|
||||||
|
# Use !health (anyone) for quick health check
|
||||||
|
|||||||
@@ -14,6 +14,7 @@ from daemon_boyfriend.services import (
|
|||||||
Message,
|
Message,
|
||||||
SearXNGService,
|
SearXNGService,
|
||||||
)
|
)
|
||||||
|
from daemon_boyfriend.utils import get_monitor
|
||||||
|
|
||||||
logger = logging.getLogger(__name__)
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
@@ -101,6 +102,9 @@ class AIChatCog(commands.Cog):
|
|||||||
return
|
return
|
||||||
|
|
||||||
# Show typing indicator while generating response
|
# Show typing indicator while generating response
|
||||||
|
monitor = get_monitor()
|
||||||
|
start_time = monitor.record_request_start()
|
||||||
|
|
||||||
async with message.channel.typing():
|
async with message.channel.typing():
|
||||||
try:
|
try:
|
||||||
response_text = await self._generate_response(message, content)
|
response_text = await self._generate_response(message, content)
|
||||||
@@ -132,7 +136,12 @@ class AIChatCog(commands.Cog):
|
|||||||
for img_url in remaining_images:
|
for img_url in remaining_images:
|
||||||
await message.channel.send(embed=self._create_image_embed(img_url))
|
await message.channel.send(embed=self._create_image_embed(img_url))
|
||||||
|
|
||||||
|
# Record successful request
|
||||||
|
monitor.record_request_success(start_time)
|
||||||
|
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
|
# Record failed request
|
||||||
|
monitor.record_request_failure(start_time, e, context="on_message")
|
||||||
logger.error(f"Mention response error: {e}", exc_info=True)
|
logger.error(f"Mention response error: {e}", exc_info=True)
|
||||||
error_message = self._get_error_message(e)
|
error_message = self._get_error_message(e)
|
||||||
await message.reply(error_message)
|
await message.reply(error_message)
|
||||||
|
|||||||
147
src/daemon_boyfriend/cogs/status.py
Normal file
147
src/daemon_boyfriend/cogs/status.py
Normal file
@@ -0,0 +1,147 @@
|
|||||||
|
"""Status cog - provides bot health and metrics commands."""
|
||||||
|
|
||||||
|
import logging
|
||||||
|
|
||||||
|
import discord
|
||||||
|
from discord.ext import commands
|
||||||
|
|
||||||
|
from daemon_boyfriend.utils import HealthStatus, get_monitor
|
||||||
|
|
||||||
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
|
||||||
|
class StatusCog(commands.Cog):
|
||||||
|
"""Bot status and monitoring commands."""
|
||||||
|
|
||||||
|
def __init__(self, bot: commands.Bot) -> None:
|
||||||
|
self.bot = bot
|
||||||
|
|
||||||
|
@commands.command(name="status")
|
||||||
|
@commands.has_permissions(administrator=True)
|
||||||
|
async def status_command(self, ctx: commands.Context) -> None:
|
||||||
|
"""Show bot status and metrics (admin only)."""
|
||||||
|
monitor = get_monitor()
|
||||||
|
metrics = monitor.get_metrics()
|
||||||
|
|
||||||
|
# Choose embed color based on health status
|
||||||
|
color_map = {
|
||||||
|
HealthStatus.HEALTHY: discord.Color.green(),
|
||||||
|
HealthStatus.DEGRADED: discord.Color.yellow(),
|
||||||
|
HealthStatus.UNHEALTHY: discord.Color.red(),
|
||||||
|
}
|
||||||
|
color = color_map.get(metrics.health_status, discord.Color.grey())
|
||||||
|
|
||||||
|
# Format uptime
|
||||||
|
uptime_hours = metrics.uptime_seconds / 3600
|
||||||
|
if uptime_hours >= 24:
|
||||||
|
uptime_str = f"{uptime_hours / 24:.1f} days"
|
||||||
|
elif uptime_hours >= 1:
|
||||||
|
uptime_str = f"{uptime_hours:.1f} hours"
|
||||||
|
else:
|
||||||
|
uptime_str = f"{metrics.uptime_seconds / 60:.1f} minutes"
|
||||||
|
|
||||||
|
# Create embed
|
||||||
|
embed = discord.Embed(
|
||||||
|
title="Bot Status Report",
|
||||||
|
color=color,
|
||||||
|
)
|
||||||
|
|
||||||
|
# Status field with emoji
|
||||||
|
status_emoji = {
|
||||||
|
HealthStatus.HEALTHY: "🟢",
|
||||||
|
HealthStatus.DEGRADED: "🟡",
|
||||||
|
HealthStatus.UNHEALTHY: "🔴",
|
||||||
|
}
|
||||||
|
embed.add_field(
|
||||||
|
name="Status",
|
||||||
|
value=f"{status_emoji.get(metrics.health_status, '⚪')} {metrics.health_status.value.upper()}",
|
||||||
|
inline=True,
|
||||||
|
)
|
||||||
|
|
||||||
|
embed.add_field(name="Uptime", value=uptime_str, inline=True)
|
||||||
|
embed.add_field(name="Guilds", value=str(len(self.bot.guilds)), inline=True)
|
||||||
|
|
||||||
|
# Request metrics
|
||||||
|
embed.add_field(
|
||||||
|
name="Requests",
|
||||||
|
value=f"Total: {metrics.total_requests}\n"
|
||||||
|
f"Success: {metrics.successful_requests}\n"
|
||||||
|
f"Failed: {metrics.failed_requests}",
|
||||||
|
inline=True,
|
||||||
|
)
|
||||||
|
|
||||||
|
embed.add_field(
|
||||||
|
name="Performance",
|
||||||
|
value=f"Avg response: {metrics.avg_response_time_ms:.0f}ms\n"
|
||||||
|
f"Req/min: {metrics.requests_per_minute:.1f}\n"
|
||||||
|
f"Error rate: {metrics.error_rate_percent:.1f}%",
|
||||||
|
inline=True,
|
||||||
|
)
|
||||||
|
|
||||||
|
# Last activity
|
||||||
|
last_success = "Never"
|
||||||
|
if metrics.last_successful_request:
|
||||||
|
last_success = metrics.last_successful_request.strftime("%Y-%m-%d %H:%M:%S UTC")
|
||||||
|
|
||||||
|
last_error = "None"
|
||||||
|
if metrics.last_error:
|
||||||
|
last_error = metrics.last_error.strftime("%Y-%m-%d %H:%M:%S UTC")
|
||||||
|
|
||||||
|
embed.add_field(
|
||||||
|
name="Last Activity",
|
||||||
|
value=f"Success: {last_success}\nError: {last_error}",
|
||||||
|
inline=False,
|
||||||
|
)
|
||||||
|
|
||||||
|
# Recent errors (if any)
|
||||||
|
if metrics.recent_errors:
|
||||||
|
error_lines = []
|
||||||
|
for error in metrics.recent_errors[-3:]: # Last 3 errors
|
||||||
|
time_str = error.timestamp.strftime("%H:%M:%S")
|
||||||
|
error_lines.append(f"`{time_str}` **{error.error_type}**: {error.message[:50]}...")
|
||||||
|
embed.add_field(
|
||||||
|
name="Recent Errors",
|
||||||
|
value="\n".join(error_lines),
|
||||||
|
inline=False,
|
||||||
|
)
|
||||||
|
|
||||||
|
await ctx.send(embed=embed)
|
||||||
|
|
||||||
|
@commands.command(name="health")
|
||||||
|
async def health_command(self, ctx: commands.Context) -> None:
|
||||||
|
"""Quick health check (anyone can use)."""
|
||||||
|
monitor = get_monitor()
|
||||||
|
metrics = monitor.get_metrics()
|
||||||
|
|
||||||
|
status_emoji = {
|
||||||
|
HealthStatus.HEALTHY: "🟢",
|
||||||
|
HealthStatus.DEGRADED: "🟡",
|
||||||
|
HealthStatus.UNHEALTHY: "🔴",
|
||||||
|
}
|
||||||
|
|
||||||
|
emoji = status_emoji.get(metrics.health_status, "⚪")
|
||||||
|
status = metrics.health_status.value
|
||||||
|
|
||||||
|
# Format uptime
|
||||||
|
uptime_hours = metrics.uptime_seconds / 3600
|
||||||
|
if uptime_hours >= 1:
|
||||||
|
uptime_str = f"{uptime_hours:.1f}h"
|
||||||
|
else:
|
||||||
|
uptime_str = f"{metrics.uptime_seconds / 60:.0f}m"
|
||||||
|
|
||||||
|
await ctx.send(
|
||||||
|
f"{emoji} **{status.upper()}** | Uptime: {uptime_str} | Requests: {metrics.total_requests}"
|
||||||
|
)
|
||||||
|
|
||||||
|
@status_command.error
|
||||||
|
async def status_error(self, ctx: commands.Context, error: commands.CommandError) -> None:
|
||||||
|
"""Handle status command errors."""
|
||||||
|
if isinstance(error, commands.MissingPermissions):
|
||||||
|
await ctx.send("You need administrator permissions to view detailed status.")
|
||||||
|
else:
|
||||||
|
logger.error(f"Status command error: {error}")
|
||||||
|
|
||||||
|
|
||||||
|
async def setup(bot: commands.Bot) -> None:
|
||||||
|
"""Load the Status cog."""
|
||||||
|
await bot.add_cog(StatusCog(bot))
|
||||||
@@ -34,6 +34,13 @@ class Settings(BaseSettings):
|
|||||||
|
|
||||||
# Logging
|
# Logging
|
||||||
log_level: str = Field("INFO", description="Logging level")
|
log_level: str = Field("INFO", description="Logging level")
|
||||||
|
log_file: str | None = Field(
|
||||||
|
None, description="Log file path (optional, logs to console if not set)"
|
||||||
|
)
|
||||||
|
log_format: str = Field(
|
||||||
|
"%(asctime)s - %(name)s - %(levelname)s - %(message)s",
|
||||||
|
description="Log message format",
|
||||||
|
)
|
||||||
|
|
||||||
# Bot Identity
|
# Bot Identity
|
||||||
bot_name: str = Field("AI Bot", description="Bot display name")
|
bot_name: str = Field("AI Bot", description="Bot display name")
|
||||||
|
|||||||
@@ -1,8 +1,13 @@
|
|||||||
"""Utility modules."""
|
"""Utility modules."""
|
||||||
|
|
||||||
from .logging import get_logger, setup_logging
|
from .logging import get_logger, setup_logging
|
||||||
|
from .monitoring import BotMonitor, HealthStatus, MetricsSnapshot, get_monitor
|
||||||
|
|
||||||
__all__ = [
|
__all__ = [
|
||||||
"setup_logging",
|
"setup_logging",
|
||||||
"get_logger",
|
"get_logger",
|
||||||
|
"BotMonitor",
|
||||||
|
"HealthStatus",
|
||||||
|
"MetricsSnapshot",
|
||||||
|
"get_monitor",
|
||||||
]
|
]
|
||||||
|
|||||||
@@ -2,6 +2,8 @@
|
|||||||
|
|
||||||
import logging
|
import logging
|
||||||
import sys
|
import sys
|
||||||
|
from logging.handlers import RotatingFileHandler
|
||||||
|
from pathlib import Path
|
||||||
|
|
||||||
from daemon_boyfriend.config import settings
|
from daemon_boyfriend.config import settings
|
||||||
|
|
||||||
@@ -10,7 +12,7 @@ def setup_logging() -> None:
|
|||||||
"""Configure application-wide logging."""
|
"""Configure application-wide logging."""
|
||||||
# Create formatter
|
# Create formatter
|
||||||
formatter = logging.Formatter(
|
formatter = logging.Formatter(
|
||||||
"%(asctime)s - %(name)s - %(levelname)s - %(message)s",
|
settings.log_format,
|
||||||
datefmt="%Y-%m-%d %H:%M:%S",
|
datefmt="%Y-%m-%d %H:%M:%S",
|
||||||
)
|
)
|
||||||
|
|
||||||
@@ -21,11 +23,29 @@ def setup_logging() -> None:
|
|||||||
# Remove existing handlers
|
# Remove existing handlers
|
||||||
root_logger.handlers.clear()
|
root_logger.handlers.clear()
|
||||||
|
|
||||||
# Console handler
|
# Console handler (always enabled)
|
||||||
console_handler = logging.StreamHandler(sys.stdout)
|
console_handler = logging.StreamHandler(sys.stdout)
|
||||||
console_handler.setFormatter(formatter)
|
console_handler.setFormatter(formatter)
|
||||||
root_logger.addHandler(console_handler)
|
root_logger.addHandler(console_handler)
|
||||||
|
|
||||||
|
# File handler (if configured)
|
||||||
|
if settings.log_file:
|
||||||
|
log_path = Path(settings.log_file)
|
||||||
|
# Create directory if it doesn't exist
|
||||||
|
log_path.parent.mkdir(parents=True, exist_ok=True)
|
||||||
|
|
||||||
|
# Use rotating file handler (10MB max, keep 5 backups)
|
||||||
|
file_handler = RotatingFileHandler(
|
||||||
|
log_path,
|
||||||
|
maxBytes=10 * 1024 * 1024, # 10MB
|
||||||
|
backupCount=5,
|
||||||
|
encoding="utf-8",
|
||||||
|
)
|
||||||
|
file_handler.setFormatter(formatter)
|
||||||
|
root_logger.addHandler(file_handler)
|
||||||
|
|
||||||
|
logging.info(f"Logging to file: {log_path}")
|
||||||
|
|
||||||
# Reduce noise from discord.py
|
# Reduce noise from discord.py
|
||||||
logging.getLogger("discord").setLevel(logging.WARNING)
|
logging.getLogger("discord").setLevel(logging.WARNING)
|
||||||
logging.getLogger("discord.http").setLevel(logging.WARNING)
|
logging.getLogger("discord.http").setLevel(logging.WARNING)
|
||||||
@@ -38,6 +58,8 @@ def setup_logging() -> None:
|
|||||||
logging.getLogger("httpx").setLevel(logging.WARNING)
|
logging.getLogger("httpx").setLevel(logging.WARNING)
|
||||||
logging.getLogger("httpcore").setLevel(logging.WARNING)
|
logging.getLogger("httpcore").setLevel(logging.WARNING)
|
||||||
|
|
||||||
|
logging.info(f"Logging initialized at {settings.log_level} level")
|
||||||
|
|
||||||
|
|
||||||
def get_logger(name: str) -> logging.Logger:
|
def get_logger(name: str) -> logging.Logger:
|
||||||
"""Get a logger with the given name."""
|
"""Get a logger with the given name."""
|
||||||
|
|||||||
282
src/daemon_boyfriend/utils/monitoring.py
Normal file
282
src/daemon_boyfriend/utils/monitoring.py
Normal file
@@ -0,0 +1,282 @@
|
|||||||
|
"""Monitoring and metrics tracking for the bot."""
|
||||||
|
|
||||||
|
import asyncio
|
||||||
|
import logging
|
||||||
|
import time
|
||||||
|
from collections import deque
|
||||||
|
from dataclasses import dataclass, field
|
||||||
|
from datetime import datetime, timezone
|
||||||
|
from enum import Enum
|
||||||
|
from typing import Callable
|
||||||
|
|
||||||
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
|
||||||
|
class HealthStatus(Enum):
|
||||||
|
"""Health status levels."""
|
||||||
|
|
||||||
|
HEALTHY = "healthy"
|
||||||
|
DEGRADED = "degraded"
|
||||||
|
UNHEALTHY = "unhealthy"
|
||||||
|
|
||||||
|
|
||||||
|
@dataclass
|
||||||
|
class ErrorRecord:
|
||||||
|
"""Record of an error occurrence."""
|
||||||
|
|
||||||
|
timestamp: datetime
|
||||||
|
error_type: str
|
||||||
|
message: str
|
||||||
|
context: str = ""
|
||||||
|
|
||||||
|
|
||||||
|
@dataclass
|
||||||
|
class MetricsSnapshot:
|
||||||
|
"""Snapshot of current metrics."""
|
||||||
|
|
||||||
|
uptime_seconds: float
|
||||||
|
total_requests: int
|
||||||
|
successful_requests: int
|
||||||
|
failed_requests: int
|
||||||
|
avg_response_time_ms: float
|
||||||
|
requests_per_minute: float
|
||||||
|
error_rate_percent: float
|
||||||
|
recent_errors: list[ErrorRecord]
|
||||||
|
health_status: HealthStatus
|
||||||
|
last_successful_request: datetime | None
|
||||||
|
last_error: datetime | None
|
||||||
|
|
||||||
|
|
||||||
|
class BotMonitor:
|
||||||
|
"""Monitors bot health and collects metrics."""
|
||||||
|
|
||||||
|
def __init__(
|
||||||
|
self,
|
||||||
|
error_threshold: int = 10,
|
||||||
|
error_window_minutes: int = 5,
|
||||||
|
max_error_history: int = 50,
|
||||||
|
) -> None:
|
||||||
|
"""Initialize the monitor.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
error_threshold: Number of errors in window to trigger unhealthy status
|
||||||
|
error_window_minutes: Time window for error rate calculation
|
||||||
|
max_error_history: Maximum number of errors to keep in history
|
||||||
|
"""
|
||||||
|
self._start_time = time.time()
|
||||||
|
self._total_requests = 0
|
||||||
|
self._successful_requests = 0
|
||||||
|
self._failed_requests = 0
|
||||||
|
self._response_times: deque[float] = deque(maxlen=100)
|
||||||
|
self._request_timestamps: deque[float] = deque(maxlen=1000)
|
||||||
|
self._errors: deque[ErrorRecord] = deque(maxlen=max_error_history)
|
||||||
|
self._error_threshold = error_threshold
|
||||||
|
self._error_window_minutes = error_window_minutes
|
||||||
|
self._last_successful_request: datetime | None = None
|
||||||
|
self._last_error: datetime | None = None
|
||||||
|
self._status_callbacks: list[Callable[[HealthStatus], None]] = []
|
||||||
|
|
||||||
|
def record_request_start(self) -> float:
|
||||||
|
"""Record the start of a request.
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
Start timestamp for use with record_request_end
|
||||||
|
"""
|
||||||
|
self._total_requests += 1
|
||||||
|
start_time = time.time()
|
||||||
|
self._request_timestamps.append(start_time)
|
||||||
|
return start_time
|
||||||
|
|
||||||
|
def record_request_success(self, start_time: float) -> None:
|
||||||
|
"""Record a successful request.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
start_time: Timestamp from record_request_start
|
||||||
|
"""
|
||||||
|
self._successful_requests += 1
|
||||||
|
response_time = (time.time() - start_time) * 1000 # Convert to ms
|
||||||
|
self._response_times.append(response_time)
|
||||||
|
self._last_successful_request = datetime.now(timezone.utc)
|
||||||
|
|
||||||
|
logger.debug(f"Request completed in {response_time:.2f}ms")
|
||||||
|
|
||||||
|
def record_request_failure(
|
||||||
|
self, start_time: float, error: Exception, context: str = ""
|
||||||
|
) -> None:
|
||||||
|
"""Record a failed request.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
start_time: Timestamp from record_request_start
|
||||||
|
error: The exception that occurred
|
||||||
|
context: Additional context about where the error occurred
|
||||||
|
"""
|
||||||
|
self._failed_requests += 1
|
||||||
|
response_time = (time.time() - start_time) * 1000
|
||||||
|
self._response_times.append(response_time)
|
||||||
|
|
||||||
|
error_record = ErrorRecord(
|
||||||
|
timestamp=datetime.now(timezone.utc),
|
||||||
|
error_type=type(error).__name__,
|
||||||
|
message=str(error)[:500], # Truncate long messages
|
||||||
|
context=context,
|
||||||
|
)
|
||||||
|
self._errors.append(error_record)
|
||||||
|
self._last_error = error_record.timestamp
|
||||||
|
|
||||||
|
logger.warning(
|
||||||
|
f"Request failed after {response_time:.2f}ms: "
|
||||||
|
f"[{error_record.error_type}] {error_record.message[:100]}"
|
||||||
|
)
|
||||||
|
|
||||||
|
# Check if we need to alert about health status change
|
||||||
|
new_status = self._calculate_health_status()
|
||||||
|
if new_status == HealthStatus.UNHEALTHY:
|
||||||
|
self._notify_status_change(new_status)
|
||||||
|
|
||||||
|
def record_error(self, error: Exception, context: str = "") -> None:
|
||||||
|
"""Record an error without request timing.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
error: The exception that occurred
|
||||||
|
context: Additional context about where the error occurred
|
||||||
|
"""
|
||||||
|
error_record = ErrorRecord(
|
||||||
|
timestamp=datetime.now(timezone.utc),
|
||||||
|
error_type=type(error).__name__,
|
||||||
|
message=str(error)[:500],
|
||||||
|
context=context,
|
||||||
|
)
|
||||||
|
self._errors.append(error_record)
|
||||||
|
self._last_error = error_record.timestamp
|
||||||
|
|
||||||
|
logger.error(f"Error recorded: [{error_record.error_type}] {error_record.message}")
|
||||||
|
|
||||||
|
def on_status_change(self, callback: Callable[[HealthStatus], None]) -> None:
|
||||||
|
"""Register a callback for health status changes.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
callback: Function to call when status changes to unhealthy
|
||||||
|
"""
|
||||||
|
self._status_callbacks.append(callback)
|
||||||
|
|
||||||
|
def _notify_status_change(self, status: HealthStatus) -> None:
|
||||||
|
"""Notify all registered callbacks of a status change."""
|
||||||
|
for callback in self._status_callbacks:
|
||||||
|
try:
|
||||||
|
callback(status)
|
||||||
|
except Exception as e:
|
||||||
|
logger.error(f"Status callback failed: {e}")
|
||||||
|
|
||||||
|
def _calculate_health_status(self) -> HealthStatus:
|
||||||
|
"""Calculate the current health status."""
|
||||||
|
now = time.time()
|
||||||
|
window_start = now - (self._error_window_minutes * 60)
|
||||||
|
|
||||||
|
# Count recent errors
|
||||||
|
recent_error_count = sum(1 for e in self._errors if e.timestamp.timestamp() > window_start)
|
||||||
|
|
||||||
|
if recent_error_count >= self._error_threshold:
|
||||||
|
return HealthStatus.UNHEALTHY
|
||||||
|
elif recent_error_count >= self._error_threshold // 2:
|
||||||
|
return HealthStatus.DEGRADED
|
||||||
|
else:
|
||||||
|
return HealthStatus.HEALTHY
|
||||||
|
|
||||||
|
def _calculate_requests_per_minute(self) -> float:
|
||||||
|
"""Calculate the current requests per minute rate."""
|
||||||
|
now = time.time()
|
||||||
|
one_minute_ago = now - 60
|
||||||
|
|
||||||
|
recent_requests = sum(1 for t in self._request_timestamps if t > one_minute_ago)
|
||||||
|
return float(recent_requests)
|
||||||
|
|
||||||
|
def get_metrics(self) -> MetricsSnapshot:
|
||||||
|
"""Get a snapshot of current metrics.
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
MetricsSnapshot with all current metrics
|
||||||
|
"""
|
||||||
|
uptime = time.time() - self._start_time
|
||||||
|
|
||||||
|
avg_response_time = 0.0
|
||||||
|
if self._response_times:
|
||||||
|
avg_response_time = sum(self._response_times) / len(self._response_times)
|
||||||
|
|
||||||
|
error_rate = 0.0
|
||||||
|
if self._total_requests > 0:
|
||||||
|
error_rate = (self._failed_requests / self._total_requests) * 100
|
||||||
|
|
||||||
|
return MetricsSnapshot(
|
||||||
|
uptime_seconds=uptime,
|
||||||
|
total_requests=self._total_requests,
|
||||||
|
successful_requests=self._successful_requests,
|
||||||
|
failed_requests=self._failed_requests,
|
||||||
|
avg_response_time_ms=avg_response_time,
|
||||||
|
requests_per_minute=self._calculate_requests_per_minute(),
|
||||||
|
error_rate_percent=error_rate,
|
||||||
|
recent_errors=list(self._errors)[-10:], # Last 10 errors
|
||||||
|
health_status=self._calculate_health_status(),
|
||||||
|
last_successful_request=self._last_successful_request,
|
||||||
|
last_error=self._last_error,
|
||||||
|
)
|
||||||
|
|
||||||
|
def get_status_report(self) -> str:
|
||||||
|
"""Get a formatted status report.
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
Human-readable status report
|
||||||
|
"""
|
||||||
|
metrics = self.get_metrics()
|
||||||
|
|
||||||
|
# Format uptime
|
||||||
|
uptime_hours = metrics.uptime_seconds / 3600
|
||||||
|
uptime_str = (
|
||||||
|
f"{uptime_hours:.1f} hours"
|
||||||
|
if uptime_hours >= 1
|
||||||
|
else f"{metrics.uptime_seconds / 60:.1f} minutes"
|
||||||
|
)
|
||||||
|
|
||||||
|
report = [
|
||||||
|
"=== Bot Status Report ===",
|
||||||
|
f"Status: {metrics.health_status.value.upper()}",
|
||||||
|
f"Uptime: {uptime_str}",
|
||||||
|
"",
|
||||||
|
"--- Request Metrics ---",
|
||||||
|
f"Total requests: {metrics.total_requests}",
|
||||||
|
f"Successful: {metrics.successful_requests}",
|
||||||
|
f"Failed: {metrics.failed_requests}",
|
||||||
|
f"Error rate: {metrics.error_rate_percent:.1f}%",
|
||||||
|
f"Avg response time: {metrics.avg_response_time_ms:.0f}ms",
|
||||||
|
f"Requests/min: {metrics.requests_per_minute:.1f}",
|
||||||
|
]
|
||||||
|
|
||||||
|
if metrics.last_successful_request:
|
||||||
|
report.append(f"Last success: {metrics.last_successful_request.isoformat()}")
|
||||||
|
|
||||||
|
if metrics.recent_errors:
|
||||||
|
report.append("")
|
||||||
|
report.append("--- Recent Errors ---")
|
||||||
|
for error in metrics.recent_errors[-5:]:
|
||||||
|
report.append(
|
||||||
|
f"[{error.timestamp.strftime('%H:%M:%S')}] {error.error_type}: {error.message[:50]}"
|
||||||
|
)
|
||||||
|
|
||||||
|
return "\n".join(report)
|
||||||
|
|
||||||
|
|
||||||
|
# Global monitor instance
|
||||||
|
_monitor: BotMonitor | None = None
|
||||||
|
|
||||||
|
|
||||||
|
def get_monitor() -> BotMonitor:
|
||||||
|
"""Get the global monitor instance."""
|
||||||
|
global _monitor
|
||||||
|
if _monitor is None:
|
||||||
|
_monitor = BotMonitor()
|
||||||
|
return _monitor
|
||||||
|
|
||||||
|
|
||||||
|
def reset_monitor() -> None:
|
||||||
|
"""Reset the global monitor (mainly for testing)."""
|
||||||
|
global _monitor
|
||||||
|
_monitor = None
|
||||||
Reference in New Issue
Block a user