Why Version Evaluations?

Healthcare AI development is iterative. As you improve your models, you need to confidently answer: “Is this version better than the last?” Proper versioning enables:
Capability       Benefit
Reproducibility  Re-run any historical evaluation with identical settings
Comparison       Apples-to-apples comparison between model versions
Audit trail      Complete history for regulatory submissions
Rollback         Quickly identify when regressions were introduced
Collaboration    Team members work from shared evaluation definitions

Versioning Components

Evaluation Definitions

An evaluation definition captures everything needed to reproduce an evaluation:
evaluation_definition.py
from rubric import Rubric

client = Rubric()

# Create a versioned evaluation definition
definition = client.evaluation_definitions.create(
    name="Triage Safety Evaluation",
    version="2.1.0",

    # Semantic versioning
    # MAJOR: Breaking changes to metrics or methodology
    # MINOR: New evaluators or metrics added
    # PATCH: Bug fixes, threshold adjustments

    evaluators=[
        {
            "type": "triage_accuracy",
            "version": "1.2.0",  # Evaluator version locked
            "config": {
                "levels": ["emergent", "urgent", "semi_urgent", "routine"],
                "severity_weights": {"under_triage": 5.0, "over_triage": 1.0}
            }
        },
        {
            "type": "red_flag_detection",
            "version": "1.0.0",
            "config": {
                "protocols": ["chest_pain_v2", "stroke_fast", "sepsis_sirs"]
            }
        }
    ],

    # Scoring configuration
    aggregation={
        "method": "weighted_average",
        "weights": {"triage_accuracy": 0.5, "red_flag_detection": 0.5}
    },

    # Human review configuration
    human_review={
        "threshold": 0.7,
        "reviewer_pool": "nurse",
        "consensus_required": False
    },

    # Metadata
    description="Standard safety evaluation for voice triage models",
    tags=["safety", "triage", "production"],

    # Lock definition after validation
    status="active"
)

print(f"Created definition: {definition.id} v{definition.version}")

Dataset Versioning

Datasets are immutable once created. Modifications create new versions.
# Create initial dataset
dataset_v1 = client.datasets.create(
    name="Triage Validation Set",
    description="Q1 2025 validation cases",
    samples=[...]  # 1000 samples
)

# Dataset is immutable - create new version to add samples
dataset_v2 = client.datasets.create_version(
    parent=dataset_v1.id,
    description="Added edge cases for pediatric triage",
    add_samples=[...],  # 50 new samples
    version_notes="Added pediatric fever cases from March incidents"
)

# Track lineage
print(f"v1: {dataset_v1.id} ({dataset_v1.sample_count} samples)")
print(f"v2: {dataset_v2.id} ({dataset_v2.sample_count} samples)")
print(f"v2 parent: {dataset_v2.parent_id}")

Comparing Model Runs

Run the same evaluation definition against different model versions to get a clean comparison.
model_comparison.py
from rubric import Rubric

client = Rubric()

# Run evaluation on multiple model versions
models_to_compare = [
    {"name": "triage-v2.3", "endpoint": "https://api.myco.com/v2.3/triage"},
    {"name": "triage-v2.4", "endpoint": "https://api.myco.com/v2.4/triage"},
    {"name": "triage-v2.4-ft", "endpoint": "https://api.myco.com/v2.4-ft/triage"}
]

comparison = client.comparisons.create(
    name="Triage Model Comparison - Feb 2025",

    # Use same definition for all
    evaluation_definition="evaldef_safety_v2",

    # Use same dataset
    dataset="ds_validation_q1",

    # Compare these models
    models=models_to_compare,

    # Statistical analysis
    analysis={
        "paired_tests": True,  # McNemar's test for paired samples
        "bootstrap_ci": True,  # Bootstrap confidence intervals
        "effect_size": True    # Cohen's d for magnitude
    }
)

# Wait for completion
comparison.wait()

# View results
print(comparison.summary())

# Output:
# Model Comparison Results
# ========================
#
# Triage Accuracy:
#   triage-v2.3:    78.2% [75.1%, 81.3%]
#   triage-v2.4:    82.4% [79.5%, 85.3%]  ↑ +4.2% (p=0.003)
#   triage-v2.4-ft: 84.1% [81.3%, 86.9%]  ↑ +5.9% (p<0.001)
#
# Red Flag Detection:
#   triage-v2.3:    91.2% [88.9%, 93.5%]
#   triage-v2.4:    93.8% [91.7%, 95.9%]  ↑ +2.6% (p=0.021)
#   triage-v2.4-ft: 95.2% [93.3%, 97.1%]  ↑ +4.0% (p=0.002)
#
# Recommendation: triage-v2.4-ft shows statistically significant
# improvement across all metrics.
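
The analysis options map onto standard paired statistics. As a rough, self-contained illustration of what they compute (not the platform's implementation), the following sketch derives a bootstrap confidence interval, Cohen's d, and the discordant-pair counts used by McNemar's test from per-sample correctness scores; the arrays are synthetic stand-ins.
import numpy as np

# Synthetic per-sample correctness for two models scored on the SAME samples.
rng = np.random.default_rng(0)
correct_v23 = rng.random(1000) < 0.78   # ~78% triage accuracy
correct_v24 = rng.random(1000) < 0.82   # ~82% triage accuracy

# Paired bootstrap CI on the accuracy difference
diffs = correct_v24.astype(float) - correct_v23.astype(float)
boot_means = [rng.choice(diffs, size=diffs.size, replace=True).mean()
              for _ in range(2000)]
ci_low, ci_high = np.percentile(boot_means, [2.5, 97.5])
print(f"Accuracy delta: {diffs.mean():+.1%} [{ci_low:+.1%}, {ci_high:+.1%}]")

# Cohen's d on the paired differences (effect-size magnitude)
d = diffs.mean() / diffs.std(ddof=1)
print(f"Cohen's d: {d:.2f}")

# McNemar's test uses only the discordant pairs: samples one model
# classified correctly and the other did not.
n01 = int(np.sum(~correct_v23 & correct_v24))  # v2.4 right, v2.3 wrong
n10 = int(np.sum(correct_v23 & ~correct_v24))  # v2.3 right, v2.4 wrong
print(f"Discordant pairs: {n01} (v2.4 only) vs {n10} (v2.3 only)")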

Regression Detection

# Check for regressions before deployment
regression_check = client.comparisons.check_regression(
    baseline="eval_v2.3_production",
    candidate="eval_v2.4_staging",

    # Define regression thresholds
    thresholds={
        "triage_accuracy": {"max_decrease": 0.02},  # No more than 2% drop
        "red_flag_sensitivity": {"max_decrease": 0.01},  # 1% max drop
        "under_triage_rate": {"max_increase": 0.005}  # Can't increase under-triage
    }
)

if regression_check.passed:
    print("✅ No regressions detected - safe to deploy")
else:
    print("❌ Regressions detected:")
    for regression in regression_check.regressions:
        print(f"  - {regression.metric}: {regression.change:+.1%} (threshold: {regression.threshold})")

Reproducibility Guarantees

Every evaluation can be exactly reproduced, which is essential for regulatory submissions and scientific validation.
reproducibility.py
# Get complete reproducibility manifest
manifest = client.evaluations.manifest("eval_abc123")

print(f"""
Evaluation Manifest
===================
ID: {manifest.evaluation_id}
Created: {manifest.created_at}

Evaluation Definition:
  ID: {manifest.definition_id}
  Version: {manifest.definition_version}
  Hash: {manifest.definition_hash}

Dataset:
  ID: {manifest.dataset_id}
  Version: {manifest.dataset_version}
  Sample Count: {manifest.sample_count}
  Hash: {manifest.dataset_hash}

Evaluators:
""")

for evaluator in manifest.evaluators:
    print(f"""  - {evaluator.type}
      Version: {evaluator.version}
      Config Hash: {evaluator.config_hash}
""")

print(f"""
Random Seed: {manifest.random_seed}
Compute Environment:
  Platform: {manifest.environment.platform}
  Version: {manifest.environment.version}

Reproducibility Hash: {manifest.reproducibility_hash}
""")
Reproducibility Hash: The reproducibility hash is a cryptographic fingerprint of all evaluation inputs. Two evaluations with the same hash are guaranteed to produce identical results.
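
As a rough mental model, you can think of the fingerprint as a hash over the manifest's input identifiers. The field selection and SHA-256 canonicalization below are assumptions for illustration, not the platform's documented algorithm:
import hashlib
import json

def manifest_fingerprint(manifest) -> str:
    """Illustrative fingerprint over all evaluation inputs (assumed construction)."""
    inputs = {
        "definition": manifest.definition_hash,
        "dataset": manifest.dataset_hash,
        "evaluators": [
            {"type": e.type, "version": e.version, "config": e.config_hash}
            for e in manifest.evaluators
        ],
        "random_seed": manifest.random_seed,
    }
    canonical = json.dumps(inputs, sort_keys=True, separators=(",", ":"))
    return hashlib.sha256(canonical.encode("utf-8")).hexdigest()

print(manifest_fingerprint(manifest))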

Re-running Historical Evaluations

# Reproduce a historical evaluation exactly (same definition, dataset, and seed)
reproduced = client.evaluations.reproduce(
    source="eval_abc123"
)

# Optional: override specific components, e.g. re-score against a newer dataset.
# Any override breaks exact reproduction, so omit overrides when verifying that
# results match the original run.
# reproduced_v2 = client.evaluations.reproduce(
#     source="eval_abc123",
#     overrides={"dataset": "ds_updated_v2"}
# )

# Compare results
original = client.evaluations.get("eval_abc123")

print(f"""
Original vs Reproduced:
  Triage Accuracy: {original.triage_accuracy:.1%} vs {reproduced.triage_accuracy:.1%}
  Same results: {original.triage_accuracy == reproduced.triage_accuracy}
""")

Version Control Integration

Sync evaluation definitions with your Git repository for complete traceability.
evaluations/triage_safety.yaml
# evaluations/triage_safety.yaml
# Evaluation definition stored in version control

name: Triage Safety Evaluation
version: 2.1.0

evaluators:
  - type: triage_accuracy
    version: 1.2.0
    config:
      levels: [emergent, urgent, semi_urgent, routine]
      severity_weights:
        under_triage: 5.0
        over_triage: 1.0

  - type: red_flag_detection
    version: 1.0.0
    config:
      protocols:
        - chest_pain_v2
        - stroke_fast
        - sepsis_sirs

aggregation:
  method: weighted_average
  weights:
    triage_accuracy: 0.5
    red_flag_detection: 0.5

thresholds:
  pass:
    triage_accuracy: 0.85
    red_flag_sensitivity: 0.95
sync_definitions.py
from rubric import Rubric

client = Rubric()

# Sync definitions from repository
client.evaluation_definitions.sync_from_repo(
    repo="github.com/myorg/ml-evaluations",
    branch="main",
    path="evaluations/",

    # Create new versions for changed files
    auto_version=True,

    # Notify on changes
    webhook="https://slack.com/webhook/..."
)
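
If you prefer to register a definition from the tracked YAML file directly rather than relying on repository sync, one possible approach is to load the file and pass its fields to the create() call shown earlier. This assumes the YAML keys map one-to-one onto the create() parameters:
import yaml  # PyYAML

# Load the Git-tracked definition and register it through the standard create() call.
with open("evaluations/triage_safety.yaml") as f:
    spec = yaml.safe_load(f)

definition = client.evaluation_definitions.create(**spec)
print(f"Registered {definition.id} v{definition.version}")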

Changelog & History

# View evaluation definition history
history = client.evaluation_definitions.history("evaldef_triage_safety")

for version in history.versions:
    print(f"""
Version: {version.version}
Created: {version.created_at}
Author: {version.created_by}
Status: {version.status}
Changes: {version.change_summary}
---""")
Version  Date          Changes
2.1.0    Feb 15, 2025  Added sepsis protocol to red flag detection
2.0.0    Jan 10, 2025  Breaking: Changed triage level names to match EMR
1.2.1    Dec 5, 2024   Fixed edge case in pediatric weighting
1.2.0    Nov 20, 2024  Added pediatric context adjustments
1.1.0    Oct 15, 2024  Added hallucination detection evaluator
1.0.0    Sep 1, 2024   Initial release