first commit
This commit is contained in:
371
tools/ai-review/enterprise/metrics.py
Normal file
371
tools/ai-review/enterprise/metrics.py
Normal file
@@ -0,0 +1,371 @@
|
||||
"""Metrics Collector
|
||||
|
||||
Observability metrics for AI agent performance monitoring.
|
||||
Tracks request counts, latencies, errors, and LLM usage.
|
||||
"""
|
||||
|
||||
import time
from collections import deque
from dataclasses import dataclass, field
from datetime import datetime, timedelta
from threading import Lock
|
||||
|
||||
|
||||
@dataclass
class MetricPoint:
    """A single metric data point."""

    # Wall-clock moment the sample was taken.
    timestamp: datetime
    # The sampled measurement.
    value: float
    # Arbitrary key/value metadata attached to the sample; a fresh dict
    # per instance (default_factory avoids the shared-mutable-default trap).
    labels: dict = field(default_factory=dict)
|
||||
|
||||
|
||||
class Counter:
    """Monotonically increasing metric; all access is guarded by a lock."""

    def __init__(self, name: str, description: str = ""):
        self.name = name
        self.description = description
        self._lock = Lock()
        self._value = 0.0

    def inc(self, value: float = 1.0):
        """Add *value* (default 1.0) to the running total."""
        with self._lock:
            self._value = self._value + value

    @property
    def value(self) -> float:
        """Current accumulated total."""
        with self._lock:
            return self._value
|
||||
|
||||
|
||||
class Gauge:
    """Metric that can move up, down, or be set outright; lock-guarded."""

    def __init__(self, name: str, description: str = ""):
        self.name = name
        self.description = description
        self._lock = Lock()
        self._value = 0.0

    def set(self, value: float):
        """Replace the current value."""
        with self._lock:
            self._value = value

    def inc(self, value: float = 1.0):
        """Raise the gauge by *value* (default 1.0)."""
        with self._lock:
            self._value = self._value + value

    def dec(self, value: float = 1.0):
        """Lower the gauge by *value* (default 1.0)."""
        with self._lock:
            self._value = self._value - value

    @property
    def value(self) -> float:
        """Current gauge reading."""
        with self._lock:
            return self._value
|
||||
|
||||
|
||||
class Histogram:
|
||||
"""Simple histogram for tracking distributions."""
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
name: str,
|
||||
description: str = "",
|
||||
buckets: list[float] | None = None,
|
||||
):
|
||||
self.name = name
|
||||
self.description = description
|
||||
self.buckets = buckets or [0.01, 0.05, 0.1, 0.25, 0.5, 1.0, 2.5, 5.0, 10.0]
|
||||
self._values: list[float] = []
|
||||
self._lock = Lock()
|
||||
|
||||
def observe(self, value: float):
|
||||
"""Record an observation."""
|
||||
with self._lock:
|
||||
self._values.append(value)
|
||||
# Keep only last 1000 observations
|
||||
if len(self._values) > 1000:
|
||||
self._values = self._values[-1000:]
|
||||
|
||||
def get_percentile(self, percentile: float) -> float:
|
||||
"""Get a percentile value."""
|
||||
with self._lock:
|
||||
if not self._values:
|
||||
return 0.0
|
||||
sorted_values = sorted(self._values)
|
||||
idx = int(len(sorted_values) * percentile / 100)
|
||||
return sorted_values[min(idx, len(sorted_values) - 1)]
|
||||
|
||||
@property
|
||||
def count(self) -> int:
|
||||
"""Get observation count."""
|
||||
with self._lock:
|
||||
return len(self._values)
|
||||
|
||||
@property
|
||||
def sum(self) -> float:
|
||||
"""Get sum of observations."""
|
||||
with self._lock:
|
||||
return sum(self._values)
|
||||
|
||||
|
||||
class MetricsCollector:
    """Central metrics collector for AI agents.

    Aggregates counters, gauges and histograms covering request handling,
    LLM usage and review actions, plus a per-agent success/failure
    breakdown. The underlying metric classes are lock-guarded; the
    per-agent and per-severity dicts here are plain dicts — assumes
    record_* calls for a given key are not racing (TODO confirm).
    """

    def __init__(self, enabled: bool = True):
        """Initialize metrics collector.

        Args:
            enabled: Whether metrics collection is enabled. When False,
                every record_* method is a no-op.
        """
        self.enabled = enabled
        self._start_time = time.time()

        # Counters
        self.requests_total = Counter(
            "ai_review_requests_total",
            "Total number of review requests processed",
        )
        self.requests_success = Counter(
            "ai_review_requests_success",
            "Number of successful review requests",
        )
        self.requests_failed = Counter(
            "ai_review_requests_failed",
            "Number of failed review requests",
        )
        self.llm_calls_total = Counter(
            "ai_review_llm_calls_total",
            "Total number of LLM API calls",
        )
        self.llm_tokens_total = Counter(
            "ai_review_llm_tokens_total",
            "Total LLM tokens consumed",
        )
        self.comments_posted = Counter(
            "ai_review_comments_posted_total",
            "Total comments posted",
        )
        self.labels_applied = Counter(
            "ai_review_labels_applied_total",
            "Total labels applied",
        )
        self.security_findings = Counter(
            "ai_review_security_findings_total",
            "Total security findings detected",
        )

        # Gauges
        self.active_requests = Gauge(
            "ai_review_active_requests",
            "Currently active review requests",
        )

        # Histograms
        self.request_duration = Histogram(
            "ai_review_request_duration_seconds",
            "Request processing duration",
        )
        self.llm_duration = Histogram(
            "ai_review_llm_duration_seconds",
            "LLM API call duration",
        )

        # Per-agent request tallies: {agent: {"total", "success", "failed"}}
        self._agent_metrics: dict[str, dict] = {}
        # Security findings broken down by severity label.
        self._security_by_severity: dict[str, int] = {}

    def record_request_start(self, agent: str):
        """Record the start of a request.

        Args:
            agent: Name of the agent handling the request.
        """
        if not self.enabled:
            return

        self.requests_total.inc()
        self.active_requests.inc()

        # setdefault creates the agent's tally on first sight.
        stats = self._agent_metrics.setdefault(
            agent,
            {"total": 0, "success": 0, "failed": 0},
        )
        stats["total"] += 1

    def record_request_end(
        self,
        agent: str,
        success: bool,
        duration_seconds: float,
    ):
        """Record the end of a request.

        Args:
            agent: Name of the agent.
            success: Whether the request succeeded.
            duration_seconds: Request duration.
        """
        if not self.enabled:
            return

        self.active_requests.dec()
        self.request_duration.observe(duration_seconds)

        if success:
            self.requests_success.inc()
            # Guarded: end may arrive for an agent never seen by start.
            if agent in self._agent_metrics:
                self._agent_metrics[agent]["success"] += 1
        else:
            self.requests_failed.inc()
            if agent in self._agent_metrics:
                self._agent_metrics[agent]["failed"] += 1

    def record_llm_call(
        self,
        provider: str,
        model: str,
        tokens: int | None,
        duration_seconds: float,
    ):
        """Record an LLM API call.

        Args:
            provider: LLM provider name (currently unused in aggregation).
            model: Model used (currently unused in aggregation).
            tokens: Tokens consumed; None (or 0) adds nothing.
            duration_seconds: Call duration.
        """
        if not self.enabled:
            return

        self.llm_calls_total.inc()
        self.llm_duration.observe(duration_seconds)
        # Truthiness check skips both None and 0 — intentional no-op either way.
        if tokens:
            self.llm_tokens_total.inc(tokens)

    def record_comment_posted(self):
        """Record a comment being posted."""
        if self.enabled:
            self.comments_posted.inc()

    def record_labels_applied(self, count: int = 1):
        """Record labels being applied.

        Args:
            count: Number of labels applied in this batch.
        """
        if self.enabled:
            self.labels_applied.inc(count)

    def record_security_finding(self, severity: str):
        """Record a security finding.

        Args:
            severity: Severity label of the finding; tracked in the
                per-severity breakdown exposed by get_summary().
        """
        if self.enabled:
            self.security_findings.inc()
            # Previously this argument was silently discarded; keep a
            # per-severity tally so it is actually observable.
            self._security_by_severity[severity] = (
                self._security_by_severity.get(severity, 0) + 1
            )

    def get_summary(self) -> dict:
        """Get a summary of all metrics.

        Returns:
            Dictionary with metric summaries.
        """
        uptime = time.time() - self._start_time

        return {
            "uptime_seconds": uptime,
            "requests": {
                "total": self.requests_total.value,
                "success": self.requests_success.value,
                "failed": self.requests_failed.value,
                "active": self.active_requests.value,
                # max(..., 1) avoids division by zero before any traffic.
                "success_rate": (
                    self.requests_success.value / max(self.requests_total.value, 1)
                ),
            },
            "llm": {
                "calls": self.llm_calls_total.value,
                "tokens": self.llm_tokens_total.value,
                "avg_duration_ms": (
                    (self.llm_duration.sum / max(self.llm_duration.count, 1)) * 1000
                ),
                "p50_duration_ms": self.llm_duration.get_percentile(50) * 1000,
                "p95_duration_ms": self.llm_duration.get_percentile(95) * 1000,
            },
            "actions": {
                "comments_posted": self.comments_posted.value,
                "labels_applied": self.labels_applied.value,
                "security_findings": self.security_findings.value,
                "security_by_severity": dict(self._security_by_severity),
            },
            "latency": {
                "avg_ms": (
                    (self.request_duration.sum / max(self.request_duration.count, 1))
                    * 1000
                ),
                "p50_ms": self.request_duration.get_percentile(50) * 1000,
                "p95_ms": self.request_duration.get_percentile(95) * 1000,
                "p99_ms": self.request_duration.get_percentile(99) * 1000,
            },
            "by_agent": self._agent_metrics,
        }

    def export_prometheus(self) -> str:
        """Export metrics in Prometheus format.

        Returns:
            Prometheus-formatted metrics string.
        """
        lines = []

        def add_metric(name: str, value: float, help_text: str = ""):
            # Emit an optional HELP line followed by the sample line.
            if help_text:
                lines.append(f"# HELP {name} {help_text}")
            lines.append(f"{name} {value}")

        add_metric(
            "ai_review_requests_total",
            self.requests_total.value,
            "Total review requests",
        )
        add_metric(
            "ai_review_requests_success_total",
            self.requests_success.value,
            "Successful requests",
        )
        add_metric(
            "ai_review_requests_failed_total",
            self.requests_failed.value,
            "Failed requests",
        )
        add_metric(
            "ai_review_llm_calls_total",
            self.llm_calls_total.value,
            "Total LLM calls",
        )
        add_metric(
            "ai_review_llm_tokens_total",
            self.llm_tokens_total.value,
            "Total LLM tokens",
        )
        add_metric(
            "ai_review_comments_posted_total",
            self.comments_posted.value,
            "Comments posted",
        )
        # Previously missing from the export even though the collector
        # defines them; added for consistency with the declared metrics.
        add_metric(
            "ai_review_labels_applied_total",
            self.labels_applied.value,
            "Labels applied",
        )
        add_metric(
            "ai_review_security_findings_total",
            self.security_findings.value,
            "Security findings detected",
        )
        add_metric(
            "ai_review_active_requests",
            self.active_requests.value,
            "Currently active review requests",
        )

        return "\n".join(lines)
|
||||
|
||||
|
||||
# Module-level singleton, created lazily by get_metrics(); None until first use.
_metrics: MetricsCollector | None = None
|
||||
|
||||
|
||||
def get_metrics() -> MetricsCollector:
    """Return the process-wide metrics collector, creating it lazily.

    NOTE(review): first-call creation is not lock-guarded — confirm the
    first call happens before concurrent use.
    """
    global _metrics
    if _metrics is None:
        _metrics = MetricsCollector()
    return _metrics
|
||||
Reference in New Issue
Block a user