
Evaluation States

Every evaluation moves through a defined lifecycle of states, described below.

State Descriptions

State      | Description                                        | Duration
pending    | Evaluation created, waiting to start               | Seconds
running    | Evaluators actively processing samples             | Minutes to hours
in_review  | Automated scoring complete, awaiting human review  | Hours to days
completed  | All scoring finished                               | Final
failed     | Error occurred during evaluation                   | Final
cancelled  | Manually stopped by user                           | Final
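
The terminal states are completed, failed, and cancelled; anything else means the evaluation is still in flight. A minimal sketch of branching on the lifecycle state, using the get_status call shown later on this page (the evaluation ID is a placeholder):
from rubric import Rubric

client = Rubric()

TERMINAL_STATES = {"completed", "failed", "cancelled"}

# Placeholder ID; substitute the ID of an evaluation you created.
status = client.evaluations.get_status("eval_abc123")

if status.status in TERMINAL_STATES:
    print(f"Finished in state: {status.status}")
elif status.status == "in_review":
    print("Automated scoring done; awaiting human review")
else:
    print(f"Still in progress: {status.status}")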

Triggering Evaluations

Evaluations can be triggered in multiple ways:

Manual (Dashboard/SDK)

# Direct SDK call
from rubric import Rubric

client = Rubric()

evaluation = client.evaluations.create(
    name="Manual Evaluation",
    project="patient-triage",
    dataset="ds_abc123",
    evaluators=[
        {"type": "triage_accuracy"},
        {"type": "red_flag_detection"}
    ]
)

CI/CD Integration

Automatically run evaluations on code changes:
# .github/workflows/evaluate.yml
name: Model Evaluation

on:
  push:
    branches: [main]
    paths:
      - 'models/**'
      - 'prompts/**'

jobs:
  evaluate:
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v3
      
      - name: Run Rubric Evaluation
        env:
          RUBRIC_API_KEY: ${{ secrets.RUBRIC_API_KEY }}
        run: |
          pip install rubric
          python scripts/run_evaluation.py

# scripts/run_evaluation.py
import os
import sys

from rubric import Rubric

client = Rubric()

evaluation = client.evaluations.create(
    name=f"CI Eval - {os.environ['GITHUB_SHA'][:7]}",
    project="patient-triage",
    dataset="ds_golden_test",
    evaluators=[
        {"type": "triage_accuracy"},
        {"type": "red_flag_detection"}
    ],
    metadata={
        "commit": os.environ["GITHUB_SHA"],
        "branch": os.environ["GITHUB_REF"],
        "triggered_by": "github_actions"
    }
)

# Wait and check results
result = client.evaluations.wait(evaluation.id, timeout=600)

# Fail CI if metrics below threshold
if result.metrics["under_triage_rate"] > 0.01:
    print(f"❌ Under-triage rate {result.metrics['under_triage_rate']:.2%} exceeds 1% threshold")
    sys.exit(1)

if result.metrics["red_flag_recall"] < 0.95:
    print(f"❌ Red flag recall {result.metrics['red_flag_recall']:.2%} below 95% threshold")
    sys.exit(1)

print("✅ All metrics within thresholds")

Scheduled Evaluations

Run evaluations on a recurring schedule:
# Configure scheduled evaluation
client.evaluations.schedule(
    name="Weekly Triage Review",
    project="patient-triage",
    dataset="ds_weekly_sample",
    evaluators=[...],
    
    schedule={
        "cron": "0 9 * * MON",  # Every Monday at 9 AM
        "timezone": "America/New_York"
    },
    
    notification={
        "on_complete": ["slack:#ai-evals", "email:[email protected]"],
        "on_failure": ["pagerduty:ai-oncall"]
    }
)

Webhook-Triggered

Trigger evaluations from external events:
# Create webhook trigger
webhook = client.webhooks.create(
    name="New Data Trigger",
    project="patient-triage",
    events=["dataset.samples.created"],
    
    action={
        "type": "create_evaluation",
        "config": {
            "evaluators": [{"type": "triage_accuracy"}],
            "min_samples": 100  # Wait until 100 new samples
        }
    }
)

# Webhook URL to configure in your data pipeline
print(f"Webhook URL: {webhook.url}")

Progress Monitoring

Polling Status

import time

evaluation = client.evaluations.create(...)

while True:
    status = client.evaluations.get_status(evaluation.id)
    
    print(f"Status: {status.status}")
    print(f"Progress: {status.completed}/{status.total} ({status.percent_complete}%)")
    
    if status.status in ["completed", "failed", "cancelled"]:
        break
    
    time.sleep(5)

Using Callbacks

# With async callback
evaluation = client.evaluations.create(
    ...,
    callback_url="https://your-server.com/webhook/evaluation-complete"
)

# Your webhook receives:
# {
#   "event": "evaluation.completed",
#   "evaluation_id": "eval_abc123",
#   "status": "completed",
#   "metrics": {...}
# }
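
If you need a concrete receiver for this payload, the sketch below shows one way to handle it. Flask is an assumption here, not part of the Rubric SDK; any framework that can accept a JSON POST at your callback_url works the same way:
from flask import Flask, request

app = Flask(__name__)

# Route path must match the callback_url registered above.
@app.route("/webhook/evaluation-complete", methods=["POST"])
def evaluation_complete():
    payload = request.get_json()
    if payload.get("event") == "evaluation.completed":
        print(f"Evaluation {payload['evaluation_id']} finished: {payload['status']}")
        # Trigger downstream work here (reports, alerts, deploy gates, ...)
    return "", 204

if __name__ == "__main__":
    app.run(port=8000)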

Streaming Progress

# Real-time progress streaming
import asyncio

async def watch_progress(evaluation_id):
    async for update in client.evaluations.stream(evaluation_id):
        print(f"Sample {update.sample_id}: {update.status}")

        if update.status == "scored":
            print(f"  Score: {update.score}")
        elif update.status == "failed":
            print(f"  Error: {update.error}")

asyncio.run(watch_progress(evaluation.id))

Wait Helper

# Synchronous wait with timeout
try:
    result = client.evaluations.wait(
        evaluation.id,
        timeout=300,  # 5 minutes
        poll_interval=5  # Check every 5 seconds
    )
    print(f"Completed! Score: {result.score}")
    
except TimeoutError:
    print("Evaluation taking too long")
    
except EvaluationFailedError as e:
    print(f"Evaluation failed: {e.message}")

Progress States

During execution, individual samples have their own states:
State       | Description
queued      | Waiting to be processed
processing  | Evaluator running
scored      | Automated scoring complete
flagged     | Needs human review
reviewed    | Human review complete
failed      | Error processing sample
skipped     | Excluded from evaluation

# Get detailed progress
progress = client.evaluations.get_progress(evaluation.id)

print(f"Total: {progress.total}")
print(f"Queued: {progress.queued}")
print(f"Processing: {progress.processing}")
print(f"Scored: {progress.scored}")
print(f"Flagged: {progress.flagged}")
print(f"Reviewed: {progress.reviewed}")
print(f"Failed: {progress.failed}")

Error Handling

Evaluation-Level Errors

try:
    result = client.evaluations.wait(evaluation.id)
    
except EvaluationFailedError as e:
    print(f"Evaluation failed: {e.message}")
    print(f"Error code: {e.code}")
    print(f"Failed at: {e.failed_at}")
    
    # Get partial results
    partial = client.evaluations.get(evaluation.id)
    print(f"Completed before failure: {partial.progress.completed}")

Sample-Level Errors

# Get failed samples
failed_samples = client.evaluations.get_samples(
    evaluation.id,
    status="failed"
)

for sample in failed_samples:
    print(f"Sample {sample.id} failed:")
    print(f"  Error: {sample.error.message}")
    print(f"  Evaluator: {sample.error.evaluator}")

Common Error Types

Error Code         | Description                | Resolution
evaluator_timeout  | Evaluator took too long    | Increase timeout or simplify evaluator
invalid_sample     | Sample data malformed      | Check sample schema
evaluator_error    | Evaluator threw exception  | Check evaluator logs
quota_exceeded     | Hit usage limits           | Upgrade plan or wait
rate_limited       | Too many concurrent evals  | Add retry logic
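
The suggested resolution for rate_limited is retry logic; in practice that means wrapping evaluation creation in exponential backoff. A minimal sketch, with the caveat that RubricError and its code attribute are assumptions rather than a documented part of the SDK:
import time

from rubric import Rubric, RubricError  # RubricError is assumed, not confirmed by these docs

client = Rubric()

def create_with_backoff(max_attempts=5, **kwargs):
    delay = 1
    for attempt in range(max_attempts):
        try:
            return client.evaluations.create(**kwargs)
        except RubricError as e:
            # Only retry the error the table marks as retryable.
            if getattr(e, "code", None) != "rate_limited" or attempt == max_attempts - 1:
                raise
            time.sleep(delay)
            delay *= 2  # back off: 1s, 2s, 4s, ...

Call create_with_backoff with the same keyword arguments you would pass to client.evaluations.create.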

Retry Failed Samples

# Retry only failed samples
retry_eval = client.evaluations.retry(
    evaluation.id,
    samples="failed"  # or list of sample IDs
)

# With different configuration
retry_eval = client.evaluations.retry(
    evaluation.id,
    samples="failed",
    evaluators=[
        {
            "type": "triage_accuracy",
            "config": {"timeout": 60}  # Increased timeout
        }
    ]
)

Cancellation

# Cancel a running evaluation
client.evaluations.cancel(evaluation.id)

# Cancel with reason
client.evaluations.cancel(
    evaluation.id,
    reason="Found bug in test data"
)

# Check cancellation
status = client.evaluations.get_status(evaluation.id)
if status.status == "cancelled":
    print(f"Cancelled at: {status.cancelled_at}")
    print(f"Reason: {status.cancellation_reason}")

Best Practices

Configure timeouts based on your evaluator complexity:
evaluators=[
    {
        "type": "triage_accuracy",
        "config": {"timeout": 30}  # Simple evaluator
    },
    {
        "type": "clinical_reasoning",
        "config": {"timeout": 120}  # LLM-based evaluator
    }
]
Prevent duplicate evaluations in CI/CD:
evaluation = client.evaluations.create(
    ...,
    idempotency_key=f"ci-{commit_sha}-{dataset_id}"
)
Always handle cases where an evaluation partially completes:
result = client.evaluations.get(evaluation.id)

if result.progress.failed > 0:
    print(f"Warning: {result.progress.failed} samples failed")
    # Metrics are computed on successful samples only
Configure alerts for evaluation failures:
client.projects.update(
    project="patient-triage",
    alerts=[{
        "event": "evaluation.failed",
        "channels": ["slack:#alerts", "pagerduty"]
    }]
)

Next Steps