Evaluate trained models on custom test datasets with comprehensive metrics and analysis. This endpoint allows you to assess model performance beyond basic training metrics.

Path Parameters

jobId
string
required
Unique identifier of the training job to evaluate

Request Body

checkpointId
string
Specific checkpoint to evaluate (defaults to best checkpoint if not specified)
evaluationName
string
required
Human-readable name for this evaluation run
testDataset
object
required
Test dataset configuration
metrics
array
List of metrics to compute during evaluation
evaluationConfig
object
Additional evaluation configuration
comparisonBaselines
array
Optional baselines to compare against

Response

evaluationId
string
Unique identifier for this evaluation run
status
string
Evaluation status: queued, running, completed, failed
estimatedDuration
string
Estimated time to complete evaluation
progress
object
Real-time evaluation progress

Example

cURL
curl -X POST "https://api.tensorone.ai/v2/training/jobs/job_train_abc123/evaluate" \
  -H "Authorization: Bearer YOUR_API_KEY" \
  -H "Content-Type: application/json" \
  -d '{
    "evaluationName": "Final Model Evaluation",
    "checkpointId": "ckpt_best_abc123",
    "testDataset": {
      "datasetId": "ds_test_456",
      "format": "json"
    },
    "metrics": ["accuracy", "precision", "recall", "f1_score", "confusion_matrix"],
    "evaluationConfig": {
      "batchSize": 32,
      "generateReports": true
    },
    "comparisonBaselines": [
      {
        "baselineType": "random",
        "baselineConfig": {}
      }
    ]
  }'
Python
import requests

# Start comprehensive evaluation
response = requests.post(
    "https://api.tensorone.ai/v2/training/jobs/job_train_abc123/evaluate",
    headers={
        "Authorization": "Bearer YOUR_API_KEY",
        "Content-Type": "application/json"
    },
    json={
        "evaluationName": "Production Readiness Test",
        "checkpointId": "ckpt_best_abc123",
        "testDataset": {
            "datasetUrl": "s3://my-bucket/test-data/",
            "format": "parquet",
            "sampleSize": 10000
        },
        "metrics": [
            "accuracy", "precision", "recall", "f1_score", 
            "auc_roc", "confusion_matrix"
        ],
        "evaluationConfig": {
            "batchSize": 64,
            "generateReports": True,
            "saveResults": True
        },
        "comparisonBaselines": [
            {
                "baselineType": "pretrained_model",
                "baselineConfig": {
                    "modelName": "bert-base-uncased"
                }  
            }
        ]
    }
)

evaluation = response.json()
print(f"Started evaluation: {evaluation['evaluationId']}")
print(f"Estimated duration: {evaluation['estimatedDuration']}")

# Monitor evaluation progress
import time

while True:
    progress_response = requests.get(
        f"https://api.tensorone.ai/v2/training/evaluations/{evaluation['evaluationId']}",
        headers={"Authorization": "Bearer YOUR_API_KEY"}
    )
    
    eval_data = progress_response.json()
    
    if eval_data['status'] == 'completed':
        print("Evaluation completed!")
        print(f"Final accuracy: {eval_data['results']['metrics']['accuracy']:.3f}")
        break
    elif eval_data['status'] == 'failed':
        print("Evaluation failed")
        break
    elif eval_data['status'] == 'running':
        progress = eval_data['progress']['percentComplete']
        print(f"Progress: {progress:.1f}%")

    time.sleep(30)
JavaScript
const response = await fetch('https://api.tensorone.ai/v2/training/jobs/job_train_abc123/evaluate', {
  method: 'POST',
  headers: {
    'Authorization': 'Bearer YOUR_API_KEY',
    'Content-Type': 'application/json'
  },
  body: JSON.stringify({
    evaluationName: 'Model Performance Benchmark',
    testDataset: {
      datasetId: 'ds_benchmark_789',
      format: 'json'
    },
    metrics: ['accuracy', 'f1_score', 'perplexity'],
    evaluationConfig: {
      batchSize: 16,
      generateReports: true
    }
  })
});

const evaluation = await response.json();
console.log('Evaluation started:', evaluation);
{
  "evaluationId": "eval_abc123_001",
  "status": "queued",
  "estimatedDuration": "25m",
  "progress": {
    "samplesProcessed": 0,
    "totalSamples": 10000,
    "percentComplete": 0.0,
    "currentMetrics": {}
  },
  "createdAt": "2024-01-16T10:00:00Z"
}

Get Evaluation Results

Retrieve detailed results from a completed evaluation:
cURL
curl -X GET "https://api.tensorone.ai/v2/training/evaluations/eval_abc123_001" \
  -H "Authorization: Bearer YOUR_API_KEY"
Python
# Get detailed evaluation results
response = requests.get(
    "https://api.tensorone.ai/v2/training/evaluations/eval_abc123_001",
    headers={"Authorization": "Bearer YOUR_API_KEY"}
)

results = response.json()

if results['status'] == 'completed':
    metrics = results['results']['metrics']
    print("=== Evaluation Results ===")
    print(f"Accuracy: {metrics['accuracy']:.3f}")
    print(f"Precision: {metrics['precision']:.3f}")
    print(f"Recall: {metrics['recall']:.3f}")
    print(f"F1 Score: {metrics['f1_score']:.3f}")
    
    # Download detailed report
    if 'reportUrl' in results['results']:
        report_response = requests.get(results['results']['reportUrl'])
        with open('evaluation_report.pdf', 'wb') as f:
            f.write(report_response.content)
        print("Downloaded detailed evaluation report")
{
  "evaluationId": "eval_abc123_001",
  "status": "completed",
  "evaluationName": "Final Model Evaluation",
  "jobId": "job_train_abc123",
  "checkpointId": "ckpt_best_abc123",
  "results": {
    "metrics": {
      "accuracy": 0.8756,
      "precision": 0.8623,
      "recall": 0.8891,
      "f1_score": 0.8755,
      "auc_roc": 0.9234,
      "confusion_matrix": [
        [850, 23, 45, 12],
        [18, 892, 34, 21],
        [32, 28, 876, 19],
        [15, 19, 22, 901]
      ]
    },
    "classMetrics": {
      "class_0": {"precision": 0.89, "recall": 0.91, "f1": 0.90},
      "class_1": {"precision": 0.87, "recall": 0.88, "f1": 0.875},
      "class_2": {"precision": 0.84, "recall": 0.86, "f1": 0.85},
      "class_3": {"precision": 0.91, "recall": 0.89, "f1": 0.90}
    },
    "baselineComparison": {
      "random_baseline": {
        "accuracy": 0.2489,
        "improvement": "+250.7%"
      }
    },
    "reportUrl": "https://reports.tensorone.ai/evaluations/eval_abc123_001.pdf",
    "rawResultsUrl": "https://results.tensorone.ai/eval_abc123_001.json"
  },
  "executionDetails": {
    "samplesEvaluated": 10000,
    "evaluationTime": "23m 45s",
    "averageInferenceTime": "12.3ms",
    "resourceUsage": {
      "gpuUtilization": 85.2,
      "memoryUsage": "8.2GB"
    }
  },
  "createdAt": "2024-01-16T10:00:00Z",
  "completedAt": "2024-01-16T10:23:45Z"
}
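
If you request confusion_matrix but also need per-class numbers, they can be derived locally. Below is a minimal Python sketch; it assumes rows of the matrix are true classes and columns are predicted classes (the response does not state the orientation, so verify against your own output).
def per_class_metrics(confusion_matrix):
    """Derive per-class precision/recall/F1 from a confusion matrix.
    Assumes rows are true classes and columns are predicted classes."""
    n = len(confusion_matrix)
    out = {}
    for c in range(n):
        tp = confusion_matrix[c][c]
        predicted = sum(confusion_matrix[r][c] for r in range(n))  # column sum
        actual = sum(confusion_matrix[c])                          # row sum
        precision = tp / predicted if predicted else 0.0
        recall = tp / actual if actual else 0.0
        f1 = 2 * precision * recall / (precision + recall) if precision + recall else 0.0
        out[f"class_{c}"] = {"precision": round(precision, 3),
                             "recall": round(recall, 3),
                             "f1": round(f1, 3)}
    return out

# e.g. per_class_metrics(results["results"]["metrics"]["confusion_matrix"])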

Evaluation Types

Standard Evaluation

Basic model performance assessment on test data:
{
    "metrics": ["accuracy", "loss"],
    "testDataset": {"datasetId": "ds_test_123"}
}

Comprehensive Evaluation

Detailed analysis with multiple metrics and baselines:
{
    "metrics": ["accuracy", "precision", "recall", "f1_score", "confusion_matrix"],
    "comparisonBaselines": [
        {"baselineType": "random"},
        {"baselineType": "majority_class"}
    ]
}

A/B Testing Evaluation

Compare multiple model versions:
{
    "evaluationName": "Model A vs Model B",
    "checkpointComparison": [
        "ckpt_model_a_123",
        "ckpt_model_b_456"
    ],
    "metrics": ["accuracy", "latency", "memory_usage"]
}
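
If you prefer to manage the comparison yourself rather than using checkpointComparison, an equivalent approach is to run one standard evaluation per checkpoint against the same test dataset and compare the completed metrics. A minimal Python sketch; the helper name is illustrative and the IDs reuse the placeholders shown elsewhere on this page.
import requests

HEADERS = {"Authorization": "Bearer YOUR_API_KEY", "Content-Type": "application/json"}

def start_checkpoint_evaluation(job_id, checkpoint_id, dataset_id):
    """Start one evaluation for a single checkpoint (illustrative helper)."""
    response = requests.post(
        f"https://api.tensorone.ai/v2/training/jobs/{job_id}/evaluate",
        headers=HEADERS,
        json={
            "evaluationName": f"A/B test: {checkpoint_id}",
            "checkpointId": checkpoint_id,
            "testDataset": {"datasetId": dataset_id},
            "metrics": ["accuracy", "f1_score"],
        },
    )
    return response.json()["evaluationId"]

# Evaluate both candidates against the same test dataset, then compare the
# completed results (see the polling loop earlier on this page).
eval_a = start_checkpoint_evaluation("job_train_abc123", "ckpt_model_a_123", "ds_test_456")
eval_b = start_checkpoint_evaluation("job_train_abc123", "ckpt_model_b_456", "ds_test_456")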

Domain-Specific Evaluation

Custom metrics for specialized domains, such as NLP:
{
    "metrics": ["bleu_score", "rouge_score", "bertscore"],  # NLP
    "evaluationConfig": {
        "customMetrics": [
            {
                "name": "semantic_similarity",
                "function": "cosine_similarity",
                "parameters": {"threshold": 0.8}
            }
        ]
    }
}
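
As an illustration of what a cosine-similarity custom metric computes, the sketch below scores prediction/reference embedding pairs locally and reports the fraction whose similarity clears the 0.8 threshold from the config above. How embeddings are produced is up to you; the function name is illustrative, not part of the API.
import numpy as np

def semantic_similarity_score(pred_embeddings, ref_embeddings, threshold=0.8):
    """Fraction of prediction/reference pairs whose cosine similarity
    meets or exceeds the threshold (illustrative local version)."""
    pred = np.asarray(pred_embeddings, dtype=float)
    ref = np.asarray(ref_embeddings, dtype=float)
    sims = np.sum(pred * ref, axis=1) / (
        np.linalg.norm(pred, axis=1) * np.linalg.norm(ref, axis=1)
    )
    return float(np.mean(sims >= threshold))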

Advanced Evaluation Features

Error Analysis

Automatically analyze common failure patterns:
{
    "evaluationConfig": {
        "errorAnalysis": {
            "enabled": True,
            "sampleFailureCases": 100,
            "categorizeErrors": True
        }
    }
}
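
Error analysis runs server-side and its output schema is not shown here. As a local illustration of the idea, the sketch below groups misclassified samples by (true, predicted) pair to surface the most common confusion patterns and keeps up to 100 example indices, mirroring sampleFailureCases.
from collections import Counter

def categorize_errors(true_labels, predicted_labels, sample_limit=100):
    """Local stand-in for error analysis: count misclassifications by
    (true, predicted) pair and keep up to sample_limit example indices."""
    pattern_counts = Counter()
    failure_cases = []
    for i, (t, p) in enumerate(zip(true_labels, predicted_labels)):
        if t != p:
            pattern_counts[(t, p)] += 1
            if len(failure_cases) < sample_limit:
                failure_cases.append(i)
    return pattern_counts.most_common(), failure_cases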

Fairness Assessment

Evaluate model fairness across different groups:
{
    "evaluationConfig": {
        "fairnessMetrics": {
            "enabled": True,
            "protectedAttributes": ["gender", "age_group"],
            "metrics": ["demographic_parity", "equalized_odds"]
        }
    }
}
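
For intuition, demographic_parity compares the positive-prediction rate across protected groups. A minimal local sketch follows; it is illustrative only, and the platform's implementation may differ.
import numpy as np

def demographic_parity_difference(predictions, groups):
    """Largest gap in positive-prediction rate between any two protected
    groups. A value near 0 indicates parity."""
    predictions = np.asarray(predictions)
    groups = np.asarray(groups)
    rates = {g: float(predictions[groups == g].mean()) for g in np.unique(groups)}
    return max(rates.values()) - min(rates.values()), rates

gap, rates = demographic_parity_difference([1, 0, 1, 1, 0, 1], ["a", "a", "a", "b", "b", "b"])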

Robustness Testing

Test model performance under various conditions:
{
    "evaluationConfig": {
        "robustnessTests": {
            "enabled": True,
            "testTypes": ["adversarial_examples", "data_drift", "noise_injection"],
            "severityLevels": ["low", "medium", "high"]
        }
    }
}
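
For intuition, a noise_injection test measures how accuracy degrades as input noise grows. A minimal local sketch; model.predict and the severity values here are assumptions, not part of the API.
import numpy as np

def noise_robustness(model, X, y, severities=(0.01, 0.05, 0.1)):
    """Add Gaussian noise at increasing severity and measure how accuracy
    degrades. model.predict is assumed to return class labels for X."""
    results = {}
    for sigma in severities:
        X_noisy = X + np.random.normal(0.0, sigma, size=X.shape)
        results[sigma] = float(np.mean(model.predict(X_noisy) == y))
    return results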

Evaluation Reports

Detailed evaluation reports include:
  • Executive Summary: High-level performance overview
  • Metric Analysis: Detailed breakdown of all computed metrics
  • Confusion Matrix: Visual representation of classification results
  • Error Analysis: Common failure patterns and examples
  • Baseline Comparison: Performance vs baseline models
  • Recommendations: Suggestions for model improvement

Best Practices

Test Dataset Preparation

  • Use representative test data that mirrors the production distribution
  • Ensure test data is completely separate from training data (see the sketch after this list)
  • Include edge cases and challenging examples
  • Balance the dataset if needed for a fair evaluation
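
A minimal sketch of the separation and balance points above, using pandas and scikit-learn (not otherwise required by this API): a stratified split keeps class balance, and a hash-based overlap check catches accidental leakage of training rows into the test set. The "label" and "text" column names are placeholders.
import hashlib

import pandas as pd
from sklearn.model_selection import train_test_split

df = pd.read_json("dataset.json")  # illustrative local dataset
train_df, test_df = train_test_split(
    df, test_size=0.2, stratify=df["label"], random_state=42
)

def row_hash(row):
    # Hash the raw input so identical rows can be detected across splits
    return hashlib.sha256(str(row["text"]).encode("utf-8")).hexdigest()

train_hashes = set(train_df.apply(row_hash, axis=1))
leaked = test_df.apply(row_hash, axis=1).isin(train_hashes).sum()
print(f"Leaked rows in test set: {leaked}")  # should be 0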

Metric Selection

  • Choose metrics appropriate for your use case
  • Include both aggregate and per-class metrics
  • Consider business-relevant metrics beyond accuracy
  • Use multiple metrics to get a comprehensive view (illustrative defaults below)
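
As a starting point for the metrics array, here is a purely illustrative mapping from task type to a reasonable default metric set; adjust it to your own use case and business metrics.
# Illustrative defaults only, not an exhaustive or prescriptive list.
METRICS_BY_TASK = {
    "binary_classification": ["accuracy", "precision", "recall", "f1_score", "auc_roc"],
    "multiclass_classification": ["accuracy", "f1_score", "confusion_matrix"],
    "text_generation": ["bleu_score", "rouge_score", "bertscore", "perplexity"],
}

metrics = METRICS_BY_TASK["multiclass_classification"]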

Evaluation Monitoring

import time
from datetime import datetime

import requests

HEADERS = {"Authorization": "Bearer YOUR_API_KEY", "Content-Type": "application/json"}

def comprehensive_evaluation(job_id, test_dataset_id):
    """Run a comprehensive model evaluation and poll until it finishes"""

    # Start evaluation
    eval_response = requests.post(
        f"https://api.tensorone.ai/v2/training/jobs/{job_id}/evaluate",
        headers=HEADERS,
        json={
            "evaluationName": f"Comprehensive Eval {datetime.now():%Y-%m-%d %H:%M}",
            "testDataset": {"datasetId": test_dataset_id},
            "metrics": [
                "accuracy", "precision", "recall", "f1_score",
                "auc_roc", "confusion_matrix"
            ],
            "comparisonBaselines": [
                {"baselineType": "random"},
                {"baselineType": "majority_class"}
            ],
            "evaluationConfig": {
                "generateReports": True,
                "errorAnalysis": {"enabled": True}
            }
        }
    )

    eval_id = eval_response.json()['evaluationId']

    # Poll until the evaluation finishes
    while True:
        status_response = requests.get(
            f"https://api.tensorone.ai/v2/training/evaluations/{eval_id}",
            headers=HEADERS
        )

        eval_data = status_response.json()

        if eval_data['status'] == 'completed':
            return eval_data['results']
        elif eval_data['status'] == 'failed':
            raise RuntimeError(f"Evaluation {eval_id} failed")

        time.sleep(30)
Evaluation results are stored for 90 days and can be accessed anytime during this period. Download important reports for long-term storage.
Use evaluation results to make data-driven decisions about model deployment, hyperparameter tuning, and data collection strategies.
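For example, a simple promotion gate can turn evaluation results into a deployment decision. A minimal sketch; the thresholds are placeholders, and results is the dictionary returned in the Get Evaluation Results example above.
def should_promote(results, min_accuracy=0.85, min_f1=0.85):
    """Promote the checkpoint only if the evaluation metrics clear the
    thresholds. Threshold values here are placeholders."""
    metrics = results["results"]["metrics"]
    return metrics["accuracy"] >= min_accuracy and metrics["f1_score"] >= min_f1

if should_promote(results):
    print("Checkpoint cleared the evaluation gate; safe to promote.")
else:
    print("Checkpoint below thresholds; keep iterating before deployment.")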