Path Parameters
endpointId: The unique identifier of the endpoint to get metrics for.
Query Parameters
timeframe: Time period for metrics (1h, 6h, 24h, 7d, 30d). Defaults to 24h.
granularity: Data point granularity (1m, 5m, 15m, 1h, 1d). Defaults to 15m.
metrics: Specific metrics to include (performance, usage, costs, resources, errors).
format: Response format (json, csv, prometheus). Defaults to json.
aggregation: Aggregation method (avg, sum, min, max, p95, p99) for time series data.
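These parameters compose freely. Before the curl examples below, here is a minimal raw-REST sketch using Python's requests library (an assumption; the official SDKs shown later in this page wrap the same endpoint) that joins a metrics list into the documented comma-separated form:

import requests

BASE_URL = "https://api.tensorone.ai/v2"

def fetch_metrics(endpoint_id, api_key, **params):
    """Fetch metrics for one endpoint. `params` maps directly to the query
    parameters documented above (timeframe, granularity, metrics, format,
    aggregation)."""
    # The metrics parameter is comma-separated, so join lists for convenience.
    if isinstance(params.get("metrics"), (list, tuple)):
        params["metrics"] = ",".join(params["metrics"])
    response = requests.get(
        f"{BASE_URL}/endpoints/{endpoint_id}/metrics",
        headers={"Authorization": f"Bearer {api_key}"},
        params=params,
        timeout=30,
    )
    response.raise_for_status()
    return response.json()

# Example: 7 days of hourly performance and cost data, p95-aggregated.
data = fetch_metrics(
    "ep_1234567890abcdef", "YOUR_API_KEY",
    timeframe="7d", granularity="1h",
    metrics=["performance", "costs"], aggregation="p95",
)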
Example Usage
Basic Performance Metrics
curl -X GET "https://api.tensorone.ai/v2/endpoints/ep_1234567890abcdef/metrics" \
-H "Authorization: Bearer YOUR_API_KEY" \
-H "Content-Type: application/json"
Detailed Metrics with Custom Timeframe
curl -X GET "https://api.tensorone.ai/v2/endpoints/ep_1234567890abcdef/metrics?timeframe=7d&granularity=1h&metrics=performance,usage,costs,resources" \
-H "Authorization: Bearer YOUR_API_KEY" \
-H "Content-Type: application/json"
Cost Analysis with Breakdown
curl -X GET "https://api.tensorone.ai/v2/endpoints/ep_1234567890abcdef/metrics?timeframe=30d&metrics=costs&format=json" \
-H "Authorization: Bearer YOUR_API_KEY" \
-H "Content-Type: application/json"
Prometheus Format for Monitoring Integration
curl -X GET "https://api.tensorone.ai/v2/endpoints/ep_1234567890abcdef/metrics?format=prometheus" \
-H "Authorization: Bearer YOUR_API_KEY" \
-H "Accept: text/plain"
Batch Metrics for Multiple Endpoints
curl -X POST "https://api.tensorone.ai/v2/endpoints/metrics/batch" \
-H "Authorization: Bearer YOUR_API_KEY" \
-H "Content-Type: application/json" \
-d '{
"endpointIds": [
"ep_1234567890abcdef",
"ep_2345678901bcdefg",
"ep_3456789012cdefgh"
],
"timeframe": "24h",
"metrics": ["performance", "costs"],
"aggregation": "avg"
}'
Response
Comprehensive Metrics Response
{
"endpointId": "ep_1234567890abcdef",
"timeframe": "24h",
"granularity": "15m",
"generatedAt": "2024-01-15T14:35:22Z",
"summary": {
"totalExecutions": 2847,
"successfulExecutions": 2815,
"failedExecutions": 32,
"successRate": 98.88,
"totalCost": 47.32,
"averageLatency": 2.3,
"p95Latency": 4.8,
"p99Latency": 8.2,
"totalComputeTime": 6547.2,
"averageQueueTime": 0.15,
"coldStartCount": 23,
"coldStartRate": 0.81
},
"performance": {
"latency": {
"timeSeries": [
{
"timestamp": "2024-01-15T00:00:00Z",
"avg": 2.1,
"p50": 1.8,
"p95": 4.2,
"p99": 7.8,
"min": 0.9,
"max": 12.5
}
],
"trends": {
"hourly": {
"average": 2.3,
"trend": "stable",
"changePercent": -2.1
},
"daily": {
"average": 2.4,
"trend": "improving",
"changePercent": -8.5
}
}
},
"throughput": {
"timeSeries": [
{
"timestamp": "2024-01-15T00:00:00Z",
"requestsPerSecond": 8.2,
"tokensPerSecond": 456.7,
"itemsProcessed": 29400
}
],
"peak": {
"requestsPerSecond": 24.8,
"timestamp": "2024-01-15T09:30:00Z"
}
},
"errorRates": {
"timeSeries": [
{
"timestamp": "2024-01-15T00:00:00Z",
"errorRate": 1.2,
"timeoutRate": 0.3,
"resourceErrorRate": 0.2,
"userErrorRate": 0.7
}
],
"breakdown": {
"4xx": 18,
"5xx": 14,
"timeouts": 8,
"resourceExhausted": 5
}
}
},
"usage": {
"requests": {
"total": 2847,
"successful": 2815,
"failed": 32,
"byHour": [
{
"hour": "2024-01-15T00:00:00Z",
"count": 124,
"avgLatency": 2.1
}
],
"patterns": {
"peakHour": "09:00-10:00",
"lowestHour": "03:00-04:00",
"weekdayAverage": 2650,
"weekendAverage": 1890
}
},
"compute": {
"totalSeconds": 6547.2,
"gpuSeconds": 6234.8,
"cpuSeconds": 312.4,
"idleTime": 145.2,
"utilizationRate": 91.5,
"efficiency": {
"score": 87.3,
"suggestions": [
"Consider batch processing for similar requests",
"Optimize model size to reduce memory usage"
]
}
},
"data": {
"inputBytes": 45672819200,
"outputBytes": 12847392000,
"cacheHits": 847,
"cacheMisses": 2000,
"cacheHitRate": 29.7,
"bandwidth": {
"ingress": "142.5 MB/hour",
"egress": "40.2 MB/hour"
}
}
},
"costs": {
"total": 47.32,
"breakdown": {
"compute": 42.15,
"storage": 2.84,
"network": 1.47,
"other": 0.86
},
"byCategory": {
"inference": 38.92,
"coldStarts": 3.23,
"idleTime": 2.17,
"dataTransfer": 1.47,
"storage": 1.53
},
"timeSeries": [
{
"timestamp": "2024-01-15T00:00:00Z",
"cost": 1.85,
"computeCost": 1.67,
"storageCost": 0.12,
"networkCost": 0.06
}
],
"trends": {
"daily": {
"average": 47.32,
"trend": "increasing",
"changePercent": 12.4,
"projection": {
"monthly": 1419.60,
"confidence": 0.85
}
}
},
"optimization": {
"potentialSavings": 8.45,
"suggestions": [
{
"category": "cold_starts",
"description": "Enable warm pools to reduce cold start costs",
"savings": 3.23,
"effort": "low"
},
{
"category": "idle_time",
"description": "Implement auto-scaling to reduce idle time",
"savings": 2.17,
"effort": "medium"
},
{
"category": "resource_sizing",
"description": "Right-size GPU allocation based on usage patterns",
"savings": 3.05,
"effort": "high"
}
]
}
},
"resources": {
"gpu": {
"utilization": {
"average": 67.3,
"peak": 94.8,
"timeSeries": [
{
"timestamp": "2024-01-15T00:00:00Z",
"utilization": 65.2,
"memoryUsage": 78.4,
"temperature": 68
}
]
},
"memory": {
"average": 78.4,
"peak": 92.1,
"allocated": "40GB",
"efficiency": 82.7
},
"performance": {
"flops": 125.4,
"memoryBandwidth": 1250.8,
"efficiency": 89.2
}
},
"cpu": {
"utilization": {
"average": 23.8,
"peak": 67.2
},
"memory": {
"usage": 45.2,
"available": "64GB"
}
},
"scaling": {
"events": [
{
"timestamp": "2024-01-15T09:30:00Z",
"action": "scale_up",
"from": 2,
"to": 4,
"reason": "High request volume",
"duration": 45.2
}
],
"currentInstances": 3,
"targetInstances": 3,
"autoScalingEnabled": true
}
},
"coldStarts": {
"count": 23,
"rate": 0.81,
"averageDuration": 34.2,
"breakdown": {
"containerStart": 12.5,
"modelLoad": 18.7,
"dependencyLoad": 3.0
},
"impact": {
"latencyIncrease": 32.1,
"costIncrease": 3.23,
"userExperience": "moderate"
},
"optimization": {
"warmPoolRecommended": true,
"estimatedImprovement": {
"latencyReduction": 28.5,
"costReduction": 2.89
}
}
},
"insights": {
"performance": [
{
"type": "latency_spike",
"severity": "medium",
"message": "Latency increased by 15% during peak hours",
"timestamp": "2024-01-15T09:30:00Z",
"recommendation": "Consider enabling auto-scaling or warm pools"
}
],
"usage": [
{
"type": "usage_pattern",
"severity": "info",
"message": "Consistent daily usage pattern detected",
"recommendation": "Predictable usage allows for capacity planning optimization"
}
],
"costs": [
{
"type": "cost_trend",
"severity": "warning",
"message": "Monthly costs trending upward by 12.4%",
"recommendation": "Review usage patterns and consider optimization strategies"
}
]
}
}
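The derived fields in summary follow directly from the raw counts in the same object. A quick arithmetic check using the sample values above:

# How the derived summary fields relate to the raw counts
# (values taken from the sample response).
total = 2847
successful = 2815
cold_starts = 23

success_rate = successful / total * 100        # 98.88%
cold_start_rate = cold_starts / total * 100    # ~0.81%

print(f"{success_rate:.2f}% success, {cold_start_rate:.2f}% cold starts")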
Cost Breakdown Response
{
"endpointId": "ep_cost_analysis",
"timeframe": "30d",
"costs": {
"total": 1247.85,
"breakdown": {
"compute": {
"amount": 1089.24,
"percentage": 87.3,
"details": {
"gpuHours": 2847.5,
"rate": 0.382,
"tier": "premium"
}
},
"storage": {
"amount": 67.42,
"percentage": 5.4,
"details": {
"models": 45.20,
"outputs": 22.22
}
},
"network": {
"amount": 34.78,
"percentage": 2.8,
"details": {
"ingress": 12.45,
"egress": 22.33
}
},
"other": {
"amount": 56.41,
"percentage": 4.5,
"details": {
"coldStarts": 28.90,
"monitoring": 15.67,
"logging": 11.84
}
}
},
"trends": {
"daily": {
"average": 41.60,
"trend": "stable",
"changePercent": 2.1
},
"weekly": {
"average": 291.18,
"trend": "increasing",
"changePercent": 8.7
}
},
"budgetAnalysis": {
"monthlyBudget": 1500.00,
"currentUsage": 1247.85,
"remainingBudget": 252.15,
"projectedTotal": 1389.42,
"onTrack": true,
"burnRate": 41.60
},
"optimization": {
"totalPotentialSavings": 187.92,
"recommendations": [
{
"category": "resource_rightsizing",
"savings": 125.67,
"confidence": 0.92,
"description": "Downsize GPU during low-usage periods",
"implementation": "Configure auto-scaling policies"
},
{
"category": "warm_pools",
"savings": 34.56,
"confidence": 0.87,
"description": "Enable warm pools to reduce cold start costs",
"implementation": "Enable warm pool with 2 instances"
},
{
"category": "data_caching",
"savings": 27.69,
"confidence": 0.78,
"description": "Implement result caching for repeated requests",
"implementation": "Enable Redis cache with 1-hour TTL"
}
]
}
}
}
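The budgetAnalysis block is simple arithmetic over the billing period. A short sketch reproducing the sample's remaining-budget and on-track fields:

# How the budgetAnalysis fields fit together (sample values above).
monthly_budget = 1500.00
current_usage = 1247.85
projected_total = 1389.42

remaining = monthly_budget - current_usage      # 252.15
on_track = projected_total <= monthly_budget    # True

print(f"${remaining:.2f} remaining; on track: {on_track}")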
Prometheus Format Response
# HELP tensorone_endpoint_requests_total Total number of requests
# TYPE tensorone_endpoint_requests_total counter
tensorone_endpoint_requests_total{endpoint_id="ep_1234567890abcdef",status="success"} 2815
tensorone_endpoint_requests_total{endpoint_id="ep_1234567890abcdef",status="error"} 32
# HELP tensorone_endpoint_latency_seconds Request latency in seconds
# TYPE tensorone_endpoint_latency_seconds histogram
tensorone_endpoint_latency_seconds_bucket{endpoint_id="ep_1234567890abcdef",le="0.5"} 284
tensorone_endpoint_latency_seconds_bucket{endpoint_id="ep_1234567890abcdef",le="1.0"} 1256
tensorone_endpoint_latency_seconds_bucket{endpoint_id="ep_1234567890abcdef",le="2.0"} 2134
tensorone_endpoint_latency_seconds_bucket{endpoint_id="ep_1234567890abcdef",le="5.0"} 2678
tensorone_endpoint_latency_seconds_bucket{endpoint_id="ep_1234567890abcdef",le="10.0"} 2815
# HELP tensorone_endpoint_gpu_utilization GPU utilization percentage
# TYPE tensorone_endpoint_gpu_utilization gauge
tensorone_endpoint_gpu_utilization{endpoint_id="ep_1234567890abcdef"} 67.3
# HELP tensorone_endpoint_cost_total Total cost in USD
# TYPE tensorone_endpoint_cost_total counter
tensorone_endpoint_cost_total{endpoint_id="ep_1234567890abcdef",category="compute"} 42.15
tensorone_endpoint_cost_total{endpoint_id="ep_1234567890abcdef",category="storage"} 2.84
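Prometheus can scrape this output directly, or you can bridge it into an existing node_exporter via the textfile collector. A minimal Python sketch of the bridge approach (the output path is an assumption; point it at your collector's textfile directory):

import os
import requests

ENDPOINT_ID = "ep_1234567890abcdef"
OUTPUT_FILE = "/var/lib/node_exporter/textfile/tensorone.prom"

def export_prometheus_metrics():
    resp = requests.get(
        f"https://api.tensorone.ai/v2/endpoints/{ENDPOINT_ID}/metrics",
        headers={"Authorization": "Bearer YOUR_API_KEY",
                 "Accept": "text/plain"},
        params={"format": "prometheus"},
        timeout=30,
    )
    resp.raise_for_status()
    # Write atomically so the collector never reads a partial file.
    tmp = OUTPUT_FILE + ".tmp"
    with open(tmp, "w") as f:
        f.write(resp.text)
    os.replace(tmp, OUTPUT_FILE)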
Metric Categories
Performance Metrics
- Latency: Response times with percentiles (p50, p95, p99)
- Throughput: Requests per second, tokens per second
- Error Rates: Success rates, failure breakdown by type
- Cold Starts: Frequency, duration, impact on performance
Usage Metrics
- Request Volume: Total requests, patterns, trends
- Compute Utilization: GPU/CPU usage, efficiency scores
- Data Transfer: Input/output volumes, bandwidth usage
- Cache Performance: Hit rates, miss rates, efficiency
Cost Metrics
- Total Costs: Comprehensive cost breakdown
- Cost Trends: Historical analysis and projections
- Optimization: Potential savings and recommendations
- Budget Tracking: Budget vs. actual spending analysis
Resource Metrics
- Hardware Utilization: GPU, CPU, memory usage
- Scaling Events: Auto-scaling activities and effectiveness
- Resource Efficiency: Utilization optimization scores
- Capacity Planning: Usage patterns and capacity recommendations
Time Series Data
Granularity Options
1m: Minute-level granularity (last 6 hours max)
5m: 5-minute intervals (last 24 hours max)
15m: 15-minute intervals (last 7 days max)
1h: Hourly intervals (last 30 days max)
1d: Daily intervals (unlimited retention)
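Because each granularity caps the lookback window, a client-side check avoids unnecessary 400 responses. A sketch encoding the limits listed above:

# Client-side guard for the granularity/timeframe pairing documented above.
# Limits are expressed in hours; 1d has no documented cap.
GRANULARITY_MAX_HOURS = {"1m": 6, "5m": 24, "15m": 7 * 24, "1h": 30 * 24, "1d": None}
TIMEFRAME_HOURS = {"1h": 1, "6h": 6, "24h": 24, "7d": 7 * 24, "30d": 30 * 24}

def validate_window(timeframe, granularity):
    cap = GRANULARITY_MAX_HOURS[granularity]
    if cap is not None and TIMEFRAME_HOURS[timeframe] > cap:
        raise ValueError(
            f"granularity {granularity} supports at most {cap}h of history; "
            f"{timeframe} exceeds that"
        )

validate_window("7d", "15m")    # ok
# validate_window("30d", "1m")  # raises ValueError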
Aggregation Methods
avg: Average values over time period
sum: Sum of all values
min: Minimum value in period
max: Maximum value in period
p95: 95th percentile
p99: 99th percentile
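Each method reduces the raw samples inside a time bucket to a single value. A local illustration with NumPy (not the server implementation):

import numpy as np

# Example latencies (seconds) observed within one bucket.
samples = np.array([0.9, 1.8, 2.1, 2.4, 4.2, 7.8, 12.5])

aggregations = {
    "avg": samples.mean(),
    "sum": samples.sum(),
    "min": samples.min(),
    "max": samples.max(),
    "p95": np.percentile(samples, 95),
    "p99": np.percentile(samples, 99),
}
for name, value in aggregations.items():
    print(f"{name}: {value:.2f}")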
Error Handling
400 Bad Request
{
"error": "INVALID_PARAMETERS",
"message": "Invalid timeframe specified",
"details": {
"parameter": "timeframe",
"value": "2y",
"allowedValues": ["1h", "6h", "24h", "7d", "30d"]
}
}
403 Forbidden
{
"error": "INSUFFICIENT_PERMISSIONS",
"message": "Metrics access requires analytics:read permission",
"details": {
"requiredPermission": "analytics:read",
"currentPermissions": ["endpoints:execute", "endpoints:read"]
}
}
429 Rate Limited
{
"error": "RATE_LIMIT_EXCEEDED",
"message": "Too many metrics requests",
"details": {
"limit": 60,
"window": "1h",
"retryAfter": 300,
"suggestion": "Use batch endpoints or increase polling interval"
}
}
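The 429 payload includes a retryAfter hint. A minimal wrapper that honors it, falling back to exponential backoff (delay values are illustrative):

import time
import requests

def get_with_retry(url, headers, params=None, max_attempts=5):
    for attempt in range(max_attempts):
        resp = requests.get(url, headers=headers, params=params, timeout=30)
        if resp.status_code != 429:
            resp.raise_for_status()
            return resp
        # Prefer the API's retryAfter hint; fall back to exponential backoff.
        try:
            delay = resp.json()["details"]["retryAfter"]
        except (ValueError, KeyError):
            delay = 2 ** attempt
        time.sleep(delay)
    raise RuntimeError("metrics request still rate-limited after retries")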
SDK Examples
Python SDK
from tensorone import TensorOneClient
import pandas as pd
import matplotlib.pyplot as plt
from datetime import datetime, timedelta
client = TensorOneClient(api_key="your_api_key")
# Basic metrics retrieval
def get_endpoint_metrics(endpoint_id, timeframe="24h"):
metrics = client.endpoints.get_metrics(
endpoint_id=endpoint_id,
timeframe=timeframe,
metrics=["performance", "usage", "costs", "resources"]
)
print(f"Metrics for {endpoint_id} (last {timeframe}):")
print(f"Total Executions: {metrics.summary.total_executions}")
print(f"Success Rate: {metrics.summary.success_rate:.2f}%")
print(f"Average Latency: {metrics.summary.average_latency:.2f}s")
print(f"Total Cost: ${metrics.summary.total_cost:.2f}")
return metrics
# Cost analysis and optimization
def analyze_costs(endpoint_id, timeframe="30d"):
metrics = client.endpoints.get_metrics(
endpoint_id=endpoint_id,
timeframe=timeframe,
metrics=["costs"]
)
costs = metrics.costs
print(f"Cost Analysis for {endpoint_id}:")
print(f"Total Cost: ${costs.total:.2f}")
print(f"Daily Average: ${costs.trends.daily.average:.2f}")
print(f"Monthly Projection: ${costs.trends.projection.monthly:.2f}")
# Optimization recommendations
if costs.optimization.potential_savings > 0:
print(f"\nOptimization Opportunities:")
print(f"Potential Savings: ${costs.optimization.potential_savings:.2f}")
for suggestion in costs.optimization.suggestions:
print(f"- {suggestion.description}")
print(f" Savings: ${suggestion.savings:.2f}")
print(f" Effort: {suggestion.effort}")
return costs
# Performance trend analysis
def analyze_performance_trends(endpoint_id, timeframe="7d"):
metrics = client.endpoints.get_metrics(
endpoint_id=endpoint_id,
timeframe=timeframe,
granularity="1h",
metrics=["performance"]
)
# Convert to pandas DataFrame for analysis
latency_data = []
for point in metrics.performance.latency.time_series:
latency_data.append({
'timestamp': point.timestamp,
'avg_latency': point.avg,
'p95_latency': point.p95,
'p99_latency': point.p99
})
df = pd.DataFrame(latency_data)
df['timestamp'] = pd.to_datetime(df['timestamp'])
# Performance analysis
avg_latency = df['avg_latency'].mean()
latency_trend = df['avg_latency'].diff().mean()
print(f"Performance Analysis:")
print(f"Average Latency: {avg_latency:.2f}s")
print(f"Latency Trend: {'Improving' if latency_trend < 0 else 'Degrading'}")
print(f"95th Percentile: {df['p95_latency'].mean():.2f}s")
print(f"99th Percentile: {df['p99_latency'].mean():.2f}s")
return df
# Resource utilization monitoring
def monitor_resource_utilization(endpoint_id, timeframe="24h"):
metrics = client.endpoints.get_metrics(
endpoint_id=endpoint_id,
timeframe=timeframe,
granularity="15m",
metrics=["resources"]
)
resources = metrics.resources
gpu = resources.gpu
print(f"Resource Utilization Analysis:")
print(f"Average GPU Utilization: {gpu.utilization.average:.1f}%")
print(f"Peak GPU Utilization: {gpu.utilization.peak:.1f}%")
print(f"GPU Memory Usage: {gpu.memory.average:.1f}%")
print(f"Resource Efficiency: {gpu.performance.efficiency:.1f}%")
# Scaling analysis
if resources.scaling.events:
print(f"\nScaling Events:")
for event in resources.scaling.events:
print(f"- {event.timestamp}: {event.action} from {event.from} to {event.to}")
print(f" Reason: {event.reason}")
return resources
# Cold start analysis
def analyze_cold_starts(endpoint_id, timeframe="7d"):
metrics = client.endpoints.get_metrics(
endpoint_id=endpoint_id,
timeframe=timeframe,
metrics=["performance"]
)
cold_starts = metrics.cold_starts
print(f"Cold Start Analysis:")
print(f"Cold Start Count: {cold_starts.count}")
print(f"Cold Start Rate: {cold_starts.rate:.2f}%")
print(f"Average Duration: {cold_starts.average_duration:.1f}s")
print(f"Latency Impact: +{cold_starts.impact.latency_increase:.1f}s")
print(f"Cost Impact: +${cold_starts.impact.cost_increase:.2f}")
if cold_starts.optimization.warm_pool_recommended:
improvement = cold_starts.optimization.estimated_improvement
print(f"\nOptimization Opportunity:")
print(f"Warm pools could reduce latency by {improvement.latency_reduction:.1f}s")
print(f"Estimated cost reduction: ${improvement.cost_reduction:.2f}")
return cold_starts
# Comprehensive dashboard data
def create_dashboard_data(endpoint_ids, timeframe="24h"):
"""Create comprehensive dashboard data for multiple endpoints"""
dashboard_data = []
for endpoint_id in endpoint_ids:
try:
metrics = client.endpoints.get_metrics(
endpoint_id=endpoint_id,
timeframe=timeframe,
metrics=["performance", "usage", "costs", "resources"]
)
dashboard_data.append({
'endpoint_id': endpoint_id,
'executions': metrics.summary.total_executions,
'success_rate': metrics.summary.success_rate,
'avg_latency': metrics.summary.average_latency,
'p95_latency': metrics.summary.p95_latency,
'total_cost': metrics.summary.total_cost,
'gpu_utilization': metrics.resources.gpu.utilization.average,
'error_rate': 100 - metrics.summary.success_rate,
'cold_starts': metrics.cold_starts.count if metrics.cold_starts else 0
})
except Exception as e:
print(f"Error fetching metrics for {endpoint_id}: {e}")
df = pd.DataFrame(dashboard_data)
return df
# Usage examples
if __name__ == "__main__":
endpoint_id = "ep_1234567890abcdef"
# Basic metrics
metrics = get_endpoint_metrics(endpoint_id)
# Cost analysis
costs = analyze_costs(endpoint_id, "30d")
# Performance trends
perf_df = analyze_performance_trends(endpoint_id, "7d")
# Resource monitoring
resources = monitor_resource_utilization(endpoint_id)
# Cold start analysis
cold_starts = analyze_cold_starts(endpoint_id)
# Dashboard for multiple endpoints
endpoints = ["ep_1234567890abcdef", "ep_2345678901bcdefg"]
dashboard_df = create_dashboard_data(endpoints)
print("\nDashboard Summary:")
print(dashboard_df.to_string(index=False))
JavaScript SDK
import { TensorOneClient } from "@tensorone/sdk";
import Chart from 'chart.js/auto';
const client = new TensorOneClient({ apiKey: "your_api_key" });
// Basic metrics retrieval
async function getEndpointMetrics(endpointId, timeframe = "24h") {
const metrics = await client.endpoints.getMetrics(endpointId, {
timeframe,
metrics: ["performance", "usage", "costs", "resources"]
});
console.log(`Metrics for ${endpointId} (last ${timeframe}):`);
console.log(`Total Executions: ${metrics.summary.totalExecutions}`);
console.log(`Success Rate: ${metrics.summary.successRate.toFixed(2)}%`);
console.log(`Average Latency: ${metrics.summary.averageLatency.toFixed(2)}s`);
console.log(`Total Cost: $${metrics.summary.totalCost.toFixed(2)}`);
return metrics;
}
// Real-time metrics dashboard
class MetricsDashboard {
constructor(endpointId, containerId) {
this.endpointId = endpointId;
this.container = document.getElementById(containerId);
this.charts = {};
this.initialize();
}
async initialize() {
await this.createCharts();
this.startRealTimeUpdates();
}
async createCharts() {
// Latency trend chart
const latencyCanvas = this.createCanvas('latency-chart');
this.charts.latency = new Chart(latencyCanvas, {
type: 'line',
data: {
labels: [],
datasets: [{
label: 'Average Latency',
data: [],
borderColor: 'rgb(75, 192, 192)',
tension: 0.1
}, {
label: 'P95 Latency',
data: [],
borderColor: 'rgb(255, 99, 132)',
tension: 0.1
}]
},
options: {
responsive: true,
plugins: {
title: {
display: true,
text: 'Latency Trends'
}
},
scales: {
y: {
beginAtZero: true,
title: {
display: true,
text: 'Latency (seconds)'
}
}
}
}
});
// Cost breakdown pie chart
const costCanvas = this.createCanvas('cost-chart');
this.charts.cost = new Chart(costCanvas, {
type: 'pie',
data: {
labels: ['Compute', 'Storage', 'Network', 'Other'],
datasets: [{
data: [],
backgroundColor: [
'rgb(255, 99, 132)',
'rgb(54, 162, 235)',
'rgb(255, 205, 86)',
'rgb(75, 192, 192)'
]
}]
},
options: {
responsive: true,
plugins: {
title: {
display: true,
text: 'Cost Breakdown'
}
}
}
});
// GPU utilization gauge
const gpuCanvas = this.createCanvas('gpu-chart');
this.charts.gpu = new Chart(gpuCanvas, {
type: 'doughnut',
data: {
labels: ['Used', 'Available'],
datasets: [{
data: [0, 100],
backgroundColor: ['rgb(255, 99, 132)', 'rgb(200, 200, 200)']
}]
},
options: {
responsive: true,
plugins: {
title: {
display: true,
text: 'GPU Utilization'
}
}
}
});
}
createCanvas(id) {
const canvas = document.createElement('canvas');
canvas.id = id;
this.container.appendChild(canvas);
return canvas;
}
async updateCharts() {
try {
const metrics = await client.endpoints.getMetrics(this.endpointId, {
timeframe: "1h",
granularity: "5m",
metrics: ["performance", "costs", "resources"]
});
// Update latency chart
if (metrics.performance?.latency?.timeSeries) {
const latencyData = metrics.performance.latency.timeSeries;
this.charts.latency.data.labels = latencyData.map(d =>
new Date(d.timestamp).toLocaleTimeString()
);
this.charts.latency.data.datasets[0].data = latencyData.map(d => d.avg);
this.charts.latency.data.datasets[1].data = latencyData.map(d => d.p95);
this.charts.latency.update();
}
// Update cost chart
if (metrics.costs?.breakdown) {
const breakdown = metrics.costs.breakdown;
this.charts.cost.data.datasets[0].data = [
breakdown.compute,
breakdown.storage,
breakdown.network,
breakdown.other
];
this.charts.cost.update();
}
// Update GPU chart
if (metrics.resources?.gpu?.utilization) {
const utilization = metrics.resources.gpu.utilization.average;
this.charts.gpu.data.datasets[0].data = [utilization, 100 - utilization];
this.charts.gpu.update();
}
} catch (error) {
console.error('Error updating charts:', error);
}
}
startRealTimeUpdates() {
// Update every 30 seconds
setInterval(() => this.updateCharts(), 30000);
// Initial update
this.updateCharts();
}
}
// Cost optimization analyzer
async function analyzeCostOptimization(endpointId, timeframe = "30d") {
const metrics = await client.endpoints.getMetrics(endpointId, {
timeframe,
metrics: ["costs"]
});
const costs = metrics.costs;
console.log(`Cost Optimization Analysis for ${endpointId}:`);
console.log(`Current Monthly Cost: $${costs.total.toFixed(2)}`);
if (costs.optimization?.potentialSavings > 0) {
console.log(`\nOptimization Opportunities:`);
console.log(`Total Potential Savings: $${costs.optimization.potentialSavings.toFixed(2)}`);
costs.optimization.suggestions.forEach((suggestion, index) => {
console.log(`\n${index + 1}. ${suggestion.description}`);
console.log(` Savings: $${suggestion.savings.toFixed(2)}`);
console.log(` Confidence: ${(suggestion.confidence * 100).toFixed(0)}%`);
console.log(` Effort: ${suggestion.effort}`);
});
// Calculate ROI for each suggestion
const roiAnalysis = costs.optimization.suggestions.map(suggestion => ({
...suggestion,
roi: (suggestion.savings / costs.total) * 100,
    annualImpact: suggestion.savings * 12 // savings are per ~30d window, so x12 approximates a year
}));
console.log(`\nROI Analysis:`);
roiAnalysis.forEach(suggestion => {
console.log(`${suggestion.category}: ${suggestion.roi.toFixed(1)}% savings`);
});
}
return costs;
}
// Performance alerting system
class PerformanceAlerting {
constructor(endpointIds, thresholds = {}) {
this.endpointIds = endpointIds;
this.thresholds = {
maxLatency: 5.0,
minSuccessRate: 95.0,
maxErrorRate: 5.0,
maxCostIncrease: 20.0,
...thresholds
};
this.alerts = [];
}
async checkAlerts() {
for (const endpointId of this.endpointIds) {
try {
const metrics = await client.endpoints.getMetrics(endpointId, {
timeframe: "1h",
metrics: ["performance", "costs"]
});
await this.evaluateMetrics(endpointId, metrics);
} catch (error) {
this.addAlert(endpointId, 'error', `Failed to fetch metrics: ${error.message}`);
}
}
return this.alerts;
}
async evaluateMetrics(endpointId, metrics) {
const summary = metrics.summary;
// Latency alerts
if (summary.averageLatency > this.thresholds.maxLatency) {
this.addAlert(endpointId, 'warning',
`High latency: ${summary.averageLatency.toFixed(2)}s (threshold: ${this.thresholds.maxLatency}s)`
);
}
// Success rate alerts
if (summary.successRate < this.thresholds.minSuccessRate) {
this.addAlert(endpointId, 'critical',
`Low success rate: ${summary.successRate.toFixed(2)}% (threshold: ${this.thresholds.minSuccessRate}%)`
);
}
// Error rate alerts
const errorRate = 100 - summary.successRate;
if (errorRate > this.thresholds.maxErrorRate) {
this.addAlert(endpointId, 'warning',
`High error rate: ${errorRate.toFixed(2)}% (threshold: ${this.thresholds.maxErrorRate}%)`
);
}
// Cost increase alerts
if (metrics.costs?.trends?.daily?.changePercent > this.thresholds.maxCostIncrease) {
this.addAlert(endpointId, 'warning',
`Cost increase: ${metrics.costs.trends.daily.changePercent.toFixed(1)}% (threshold: ${this.thresholds.maxCostIncrease}%)`
);
}
}
addAlert(endpointId, severity, message) {
this.alerts.push({
endpointId,
severity,
message,
timestamp: new Date().toISOString()
});
}
async sendNotifications() {
if (this.alerts.length === 0) return;
// Group alerts by severity
const alertsBySeverity = this.alerts.reduce((acc, alert) => {
acc[alert.severity] = acc[alert.severity] || [];
acc[alert.severity].push(alert);
return acc;
}, {});
console.log('Performance Alerts:');
Object.entries(alertsBySeverity).forEach(([severity, alerts]) => {
console.log(`\n${severity.toUpperCase()} (${alerts.length}):`);
alerts.forEach(alert => {
console.log(` ${alert.endpointId}: ${alert.message}`);
});
});
// Here you would integrate with your notification system
// (email, Slack, PagerDuty, etc.)
}
}
// Usage examples
async function main() {
const endpointId = "ep_1234567890abcdef";
const endpointIds = ["ep_1234567890abcdef", "ep_2345678901bcdefg"];
try {
// Basic metrics
const metrics = await getEndpointMetrics(endpointId);
// Cost optimization
await analyzeCostOptimization(endpointId);
// Create real-time dashboard (if running in browser)
// const dashboard = new MetricsDashboard(endpointId, 'dashboard-container');
// Set up performance alerting
const alerting = new PerformanceAlerting(endpointIds, {
maxLatency: 3.0,
minSuccessRate: 98.0,
maxErrorRate: 2.0,
maxCostIncrease: 15.0
});
const alerts = await alerting.checkAlerts();
await alerting.sendNotifications();
} catch (error) {
console.error("Metrics analysis error:", error);
}
}
main();
Use Cases
Production Monitoring
- SLA Monitoring: Track performance against service level agreements
- Cost Control: Monitor spending and identify optimization opportunities
- Capacity Planning: Analyze usage patterns for infrastructure planning
- Performance Optimization: Identify bottlenecks and optimization opportunities
Business Intelligence
- Usage Analytics: Understand user behavior and usage patterns
- Cost Attribution: Track costs by team, project, or customer
- ROI Analysis: Measure return on investment for AI initiatives
- Trend Analysis: Identify growth patterns and seasonal variations
DevOps and Automation
- Auto-scaling: Use metrics to trigger scaling decisions
- Alerting: Set up automated alerts for performance and cost thresholds
- CI/CD Integration: Include performance metrics in deployment pipelines
- Resource Optimization: Automatically optimize resource allocation
Research and Development
- Model Performance: Compare different model versions and configurations
- A/B Testing: Analyze performance differences between model variants
- Optimization Research: Identify areas for model and infrastructure improvement
- Benchmarking: Compare performance against industry standards
Best Practices
Monitoring Strategy
- Key Metrics: Focus on metrics that align with business objectives
- Alerting Thresholds: Set appropriate thresholds to avoid alert fatigue
- Dashboards: Create role-specific dashboards for different stakeholders
- Historical Analysis: Maintain historical data for trend analysis
Performance Optimization
- Regular Review: Regularly review metrics to identify optimization opportunities
- Baseline Establishment: Establish performance baselines for comparison
- Proactive Monitoring: Monitor trends to prevent issues before they occur
- Cross-correlation: Analyze relationships between different metrics
Cost Management
- Budget Tracking: Set up budget alerts and tracking
- Cost Attribution: Tag resources for accurate cost attribution
- Optimization Cycles: Regularly review and implement cost optimizations
- Forecasting: Use historical data for cost forecasting and planning
Data Quality
- Metric Validation: Validate metric accuracy and completeness
- Data Retention: Define appropriate data retention policies
- Access Control: Implement proper access controls for sensitive metrics
- Documentation: Document metric definitions and calculation methods
Metrics are calculated in real-time and cached for 5 minutes to ensure optimal performance. For the most
current data, use granularities of 1m or 5m.
Historical metrics are retained for 90 days. Export important data for longer-term analysis and archival
purposes.
Use the batch metrics endpoint to efficiently retrieve metrics for multiple endpoints. Set up automated
reporting and alerting to proactively manage performance and costs.
Authorizations
API key authentication. Pass your key as 'Bearer YOUR_API_KEY' in the Authorization header.
Response
200 - application/json: Endpoint metrics. The response is of type object.