"""Metrics Collector Observability metrics for AI agent performance monitoring. Tracks request counts, latencies, errors, and LLM usage. """ import time from dataclasses import dataclass, field from datetime import datetime, timedelta from threading import Lock @dataclass class MetricPoint: """A single metric data point.""" timestamp: datetime value: float labels: dict = field(default_factory=dict) class Counter: """Thread-safe counter metric.""" def __init__(self, name: str, description: str = ""): self.name = name self.description = description self._value = 0.0 self._lock = Lock() def inc(self, value: float = 1.0): """Increment the counter.""" with self._lock: self._value += value @property def value(self) -> float: """Get current counter value.""" with self._lock: return self._value class Gauge: """Thread-safe gauge metric.""" def __init__(self, name: str, description: str = ""): self.name = name self.description = description self._value = 0.0 self._lock = Lock() def set(self, value: float): """Set the gauge value.""" with self._lock: self._value = value def inc(self, value: float = 1.0): """Increment the gauge.""" with self._lock: self._value += value def dec(self, value: float = 1.0): """Decrement the gauge.""" with self._lock: self._value -= value @property def value(self) -> float: """Get current gauge value.""" with self._lock: return self._value class Histogram: """Simple histogram for tracking distributions.""" def __init__( self, name: str, description: str = "", buckets: list[float] | None = None, ): self.name = name self.description = description self.buckets = buckets or [0.01, 0.05, 0.1, 0.25, 0.5, 1.0, 2.5, 5.0, 10.0] self._values: list[float] = [] self._lock = Lock() def observe(self, value: float): """Record an observation.""" with self._lock: self._values.append(value) # Keep only last 1000 observations if len(self._values) > 1000: self._values = self._values[-1000:] def get_percentile(self, percentile: float) -> float: """Get a percentile value.""" with self._lock: if not self._values: return 0.0 sorted_values = sorted(self._values) idx = int(len(sorted_values) * percentile / 100) return sorted_values[min(idx, len(sorted_values) - 1)] @property def count(self) -> int: """Get observation count.""" with self._lock: return len(self._values) @property def sum(self) -> float: """Get sum of observations.""" with self._lock: return sum(self._values) class MetricsCollector: """Central metrics collector for AI agents.""" def __init__(self, enabled: bool = True): """Initialize metrics collector. Args: enabled: Whether metrics collection is enabled. """ self.enabled = enabled self._start_time = time.time() # Counters self.requests_total = Counter( "ai_review_requests_total", "Total number of review requests processed", ) self.requests_success = Counter( "ai_review_requests_success", "Number of successful review requests", ) self.requests_failed = Counter( "ai_review_requests_failed", "Number of failed review requests", ) self.llm_calls_total = Counter( "ai_review_llm_calls_total", "Total number of LLM API calls", ) self.llm_tokens_total = Counter( "ai_review_llm_tokens_total", "Total LLM tokens consumed", ) self.comments_posted = Counter( "ai_review_comments_posted_total", "Total comments posted", ) self.labels_applied = Counter( "ai_review_labels_applied_total", "Total labels applied", ) self.security_findings = Counter( "ai_review_security_findings_total", "Total security findings detected", ) # Gauges self.active_requests = Gauge( "ai_review_active_requests", "Currently active review requests", ) # Histograms self.request_duration = Histogram( "ai_review_request_duration_seconds", "Request processing duration", ) self.llm_duration = Histogram( "ai_review_llm_duration_seconds", "LLM API call duration", ) # Per-agent metrics self._agent_metrics: dict[str, dict] = {} def record_request_start(self, agent: str): """Record the start of a request. Args: agent: Name of the agent handling the request. """ if not self.enabled: return self.requests_total.inc() self.active_requests.inc() if agent not in self._agent_metrics: self._agent_metrics[agent] = { "total": 0, "success": 0, "failed": 0, } self._agent_metrics[agent]["total"] += 1 def record_request_end( self, agent: str, success: bool, duration_seconds: float, ): """Record the end of a request. Args: agent: Name of the agent. success: Whether the request succeeded. duration_seconds: Request duration. """ if not self.enabled: return self.active_requests.dec() self.request_duration.observe(duration_seconds) if success: self.requests_success.inc() if agent in self._agent_metrics: self._agent_metrics[agent]["success"] += 1 else: self.requests_failed.inc() if agent in self._agent_metrics: self._agent_metrics[agent]["failed"] += 1 def record_llm_call( self, provider: str, model: str, tokens: int | None, duration_seconds: float, ): """Record an LLM API call. Args: provider: LLM provider name. model: Model used. tokens: Tokens consumed. duration_seconds: Call duration. """ if not self.enabled: return self.llm_calls_total.inc() self.llm_duration.observe(duration_seconds) if tokens: self.llm_tokens_total.inc(tokens) def record_comment_posted(self): """Record a comment being posted.""" if self.enabled: self.comments_posted.inc() def record_labels_applied(self, count: int = 1): """Record labels being applied.""" if self.enabled: self.labels_applied.inc(count) def record_security_finding(self, severity: str): """Record a security finding.""" if self.enabled: self.security_findings.inc() def get_summary(self) -> dict: """Get a summary of all metrics. Returns: Dictionary with metric summaries. """ uptime = time.time() - self._start_time return { "uptime_seconds": uptime, "requests": { "total": self.requests_total.value, "success": self.requests_success.value, "failed": self.requests_failed.value, "active": self.active_requests.value, "success_rate": ( self.requests_success.value / max(self.requests_total.value, 1) ), }, "llm": { "calls": self.llm_calls_total.value, "tokens": self.llm_tokens_total.value, "avg_duration_ms": ( (self.llm_duration.sum / max(self.llm_duration.count, 1)) * 1000 ), "p50_duration_ms": self.llm_duration.get_percentile(50) * 1000, "p95_duration_ms": self.llm_duration.get_percentile(95) * 1000, }, "actions": { "comments_posted": self.comments_posted.value, "labels_applied": self.labels_applied.value, "security_findings": self.security_findings.value, }, "latency": { "avg_ms": ( (self.request_duration.sum / max(self.request_duration.count, 1)) * 1000 ), "p50_ms": self.request_duration.get_percentile(50) * 1000, "p95_ms": self.request_duration.get_percentile(95) * 1000, "p99_ms": self.request_duration.get_percentile(99) * 1000, }, "by_agent": self._agent_metrics, } def export_prometheus(self) -> str: """Export metrics in Prometheus format. Returns: Prometheus-formatted metrics string. """ lines = [] def add_metric(name: str, value: float, help_text: str = ""): if help_text: lines.append(f"# HELP {name} {help_text}") lines.append(f"{name} {value}") add_metric( "ai_review_requests_total", self.requests_total.value, "Total review requests", ) add_metric( "ai_review_requests_success_total", self.requests_success.value, "Successful requests", ) add_metric( "ai_review_requests_failed_total", self.requests_failed.value, "Failed requests", ) add_metric( "ai_review_llm_calls_total", self.llm_calls_total.value, "Total LLM calls", ) add_metric( "ai_review_llm_tokens_total", self.llm_tokens_total.value, "Total LLM tokens", ) add_metric( "ai_review_comments_posted_total", self.comments_posted.value, "Comments posted", ) return "\n".join(lines) # Global instance _metrics: MetricsCollector | None = None def get_metrics() -> MetricsCollector: """Get the global metrics collector instance.""" global _metrics if _metrics is None: _metrics = MetricsCollector() return _metrics