Overview

Rubric’s data model is built around six core objects that work together to enable healthcare AI evaluation:

- Datasets: collections of samples for evaluation
- Tasks: individual items needing human review
- Models: AI models being evaluated
- Evaluations: automated scoring runs
- Reviewers: clinicians who review AI outputs
- Scores: evaluation results and metrics
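
All Usage snippets below assume an initialized client. Setup isn't shown on this page, so the following sketch assumes a rubric Python package with an API-key constructor; both names are assumptions, not documented API.

# Hypothetical setup -- the package name and constructor are
# assumptions, not confirmed by this page
import os
import rubric

client = rubric.Client(api_key=os.environ["RUBRIC_API_KEY"])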

Datasets

Datasets are collections of samples that share a common purpose — typically test sets for evaluation.

Schema

{
  "id": "ds_abc123",
  "object": "dataset",
  "name": "Triage Golden Set Q1 2025",
  "description": "Quarterly evaluation test set",
  "project": "proj_xyz789",
  "sample_count": 247,
  "created_at": "2025-01-15T10:30:00Z",
  "updated_at": "2025-01-15T14:22:00Z",
  "tags": ["golden-set", "quarterly"],
  "metadata": {
    "annotator": "clinical_team",
    "version": "3"
  }
}

Usage

# Create a dataset
dataset = client.datasets.create(
    name="Triage Golden Set Q1 2025",
    description="Quarterly evaluation test set",
    project="patient-triage",
    tags=["golden-set", "quarterly"]
)

# List datasets
datasets = client.datasets.list(project="patient-triage")

# Get dataset details
dataset = client.datasets.get("ds_abc123")

# Update dataset
client.datasets.update(
    "ds_abc123",
    description="Updated description"
)

# Delete dataset
client.datasets.delete("ds_abc123")

Samples

Samples are the individual data points within a dataset:
{
  "id": "smp_def456",
  "object": "sample",
  "dataset": "ds_abc123",
  "input": {
    "transcript": [...],
    "audio_url": "s3://bucket/call.wav"
  },
  "output": {
    "triage_level": "urgent",
    "symptoms": ["chest_pain"]
  },
  "expected": {
    "triage_level": "emergency"
  },
  "metadata": {
    "call_id": "call_123"
  },
  "created_at": "2025-01-15T10:30:00Z"
}
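
This page doesn't document a write endpoint for samples. Assuming one mirrors the other resources, creating a sample might look like the sketch below; the client.samples.create call is hypothetical.

# Hypothetical call -- a samples endpoint is assumed here,
# mirroring the other resources on this page
sample = client.samples.create(
    dataset="ds_abc123",
    input={"audio_url": "s3://bucket/call.wav"},
    expected={"triage_level": "emergency"},
    metadata={"call_id": "call_123"},
)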

Tasks

Tasks represent individual items requiring human review. They’re created automatically when AI outputs need clinical oversight.

Schema

{
  "id": "task_ghi789",
  "object": "task",
  "type": "triage_review",
  "status": "pending",
  "priority": "high",
  "sample": "smp_def456",
  "project": "proj_xyz789",
  "assigned_to": null,
  "created_at": "2025-01-15T10:30:00Z",
  "due_at": "2025-01-15T18:00:00Z",
  "flag_reason": "low_confidence",
  "metadata": {
    "confidence_score": 0.72,
    "red_flags_detected": true
  }
}

Task Status Flow

pending → assigned → in_progress → completed
                  ↘            ↗
                    skipped
Status        Description
pending       Awaiting assignment
assigned      Assigned to a reviewer
in_progress   Reviewer is actively working
completed     Review submitted
skipped       Reviewer skipped (reassigned)

Usage

# List pending tasks
tasks = client.tasks.list(
    project="patient-triage",
    status="pending"
)

# Get task details
task = client.tasks.get("task_ghi789")

# Assign task to reviewer
client.tasks.assign(
    "task_ghi789",
    reviewer="rev_jkl012"
)

# Complete a task with review
client.tasks.complete(
    "task_ghi789",
    review={
        "triage_correct": False,
        "correct_triage": "emergency",
        "notes": "Missed cardiac symptoms"
    }
)
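
Putting these calls together, a minimal queue-draining loop might look like the sketch below. It assumes tasks.list returns task objects with an id attribute, which this page doesn't confirm.

# Sketch: assign and complete pending tasks for one reviewer.
# Assumes list() returns task objects with an `id` attribute.
pending = client.tasks.list(project="patient-triage", status="pending")
for task in pending:
    client.tasks.assign(task.id, reviewer="rev_jkl012")
    # In practice the reviewer completes the task in the review UI;
    # direct completion here is for illustration only.
    client.tasks.complete(
        task.id,
        review={"triage_correct": True, "notes": "Agrees with AI triage"},
    )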

Models

Models represent the AI systems being evaluated. Use them to track different versions and configurations.

Schema

{
  "id": "mod_mno345",
  "object": "model",
  "name": "Triage Voice Agent",
  "description": "Production triage model",
  "project": "proj_xyz789",
  "versions": [
    {
      "version": "v2.3.1",
      "created_at": "2025-01-10T10:00:00Z",
      "config": {
        "temperature": 0.3,
        "max_tokens": 1024
      }
    },
    {
      "version": "v2.3.0",
      "created_at": "2025-01-01T10:00:00Z"
    }
  ],
  "current_version": "v2.3.1",
  "created_at": "2024-06-15T10:00:00Z"
}

Usage

# Register a model
model = client.models.register(
    name="Triage Voice Agent",
    project="patient-triage",
    version="v2.3.1",
    config={
        "temperature": 0.3,
        "max_tokens": 1024,
        "base_model": "claude-sonnet-4-20250514"
    }
)

# List models
models = client.models.list(project="patient-triage")

# Get model details
model = client.models.get("mod_mno345")

# Add new version
client.models.add_version(
    "mod_mno345",
    version="v2.4.0",
    config={...}
)
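
To inspect the configuration that is currently live, look up the versions entry matching current_version. The attribute-and-dict access below assumes the SDK returns objects shaped like the schema above.

# Sketch: fetch the config of the model's current version.
# Field shapes are assumed from the schema shown above.
model = client.models.get("mod_mno345")
current = next(
    v for v in model.versions if v["version"] == model.current_version
)
print(current["config"])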

Evaluations

Evaluations are automated scoring runs that assess AI performance against a dataset.

Schema

{
  "id": "eval_pqr678",
  "object": "evaluation",
  "name": "Weekly Triage Evaluation",
  "status": "completed",
  "project": "proj_xyz789",
  "dataset": "ds_abc123",
  "model": "mod_mno345",
  "evaluators": [
    {"type": "triage_accuracy", "config": {...}},
    {"type": "red_flag_detection", "config": {...}}
  ],
  "progress": {
    "total": 247,
    "completed": 247,
    "failed": 3
  },
  "metrics": {
    "triage_accuracy": 0.89,
    "red_flag_recall": 0.97,
    "under_triage_rate": 0.02
  },
  "created_at": "2025-01-15T10:30:00Z",
  "started_at": "2025-01-15T10:30:05Z",
  "completed_at": "2025-01-15T10:35:22Z"
}

Evaluation Status Flow

pending → running → completed
                 ↘ failed
                 ↘ cancelled

Usage

# Create evaluation
evaluation = client.evaluations.create(
    name="Weekly Triage Evaluation",
    project="patient-triage",
    dataset="ds_abc123",
    evaluators=[
        {"type": "triage_accuracy"},
        {"type": "red_flag_detection"}
    ]
)

# Check status
status = client.evaluations.get_status(evaluation.id)

# Wait for completion
result = client.evaluations.wait(evaluation.id)

# Get detailed results
results = client.evaluations.get(evaluation.id)

# Get per-sample results
samples = client.evaluations.get_samples(evaluation.id)

# Compare evaluations
comparison = client.evaluations.compare([eval1.id, eval2.id])
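
If you'd rather poll than block on wait, a loop over get_status covers the terminal states from the diagram above. The sketch assumes get_status returns the status string.

# Sketch: manual polling as an alternative to wait().
# Assumes get_status() returns the status as a string.
import time

status = client.evaluations.get_status(evaluation.id)
while status not in ("completed", "failed", "cancelled"):
    time.sleep(5)
    status = client.evaluations.get_status(evaluation.id)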

Reviewers

Reviewers are clinicians who provide human oversight on AI outputs.

Schema

{
  "id": "rev_stu901",
  "object": "reviewer",
  "user": "user_abc123",
  "name": "Dr. Sarah Chen",
  "email": "[email protected]",
  "credentials": [
    {
      "type": "MD",
      "license_number": "MD12345",
      "state": "CA",
      "verified": true,
      "expires_at": "2026-12-31"
    }
  ],
  "specialties": ["internal_medicine", "cardiology"],
  "capacity": {
    "max_daily_tasks": 50,
    "current_assigned": 12
  },
  "stats": {
    "total_reviews": 1247,
    "avg_review_time_seconds": 180,
    "agreement_rate": 0.94
  },
  "created_at": "2024-06-15T10:00:00Z"
}

Credential Types

Type   Description                 Can Review
MD     Doctor of Medicine          All clinical decisions
DO     Doctor of Osteopathy        All clinical decisions
NP     Nurse Practitioner          Triage, symptom assessment
PA     Physician Assistant         Triage, symptom assessment
RN     Registered Nurse            Protocol compliance, documentation
LPN    Licensed Practical Nurse    Basic documentation review

Usage

# Add a reviewer
reviewer = client.reviewers.create(
    user="user_abc123",
    credentials=[{
        "type": "MD",
        "license_number": "MD12345",
        "state": "CA"
    }],
    specialties=["internal_medicine"]
)

# List reviewers
reviewers = client.reviewers.list(
    credential_type="MD",
    available=True
)

# Get reviewer stats
stats = client.reviewers.get_stats("rev_stu901")

# Update capacity
client.reviewers.update(
    "rev_stu901",
    capacity={"max_daily_tasks": 75}
)
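
Because credentials gate what a reviewer may review (see the table above), task routing typically filters on them. The sketch below combines the reviewer and task calls; the result shape of reviewers.list is an assumption.

# Sketch: route a clinical-decision task to an available physician.
# Assumes reviewers.list() returns a sequence of objects with `id`.
physicians = client.reviewers.list(credential_type="MD", available=True)
if physicians:
    client.tasks.assign("task_ghi789", reviewer=physicians[0].id)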

Scores

Scores are evaluation results from both automated evaluators and human reviewers.

Schema

{
  "id": "scr_vwx234",
  "object": "score",
  "sample": "smp_def456",
  "evaluation": "eval_pqr678",
  "source": "evaluator",
  "evaluator_type": "triage_accuracy",
  "value": 0.0,
  "passed": false,
  "details": {
    "predicted": "urgent",
    "expected": "emergency",
    "error_type": "under_triage"
  },
  "created_at": "2025-01-15T10:32:00Z"
}

Score Sources

Source      Description
evaluator   Automated evaluation score
reviewer    Human reviewer score
consensus   Aggregated from multiple reviewers

Usage

# Get scores for an evaluation
scores = client.scores.list(evaluation="eval_pqr678")

# Get scores for a sample
scores = client.scores.list(sample="smp_def456")

# Get score details
score = client.scores.get("scr_vwx234")

# Aggregate scores
aggregates = client.scores.aggregate(
    evaluation="eval_pqr678",
    group_by="evaluator_type"
)
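
Because each score carries a boolean passed field, an overall pass rate falls out of a single list call. The sketch assumes scores.list returns a sequence of objects with a passed attribute.

# Sketch: compute the pass rate for one evaluation.
# Assumes scores.list() returns objects with a boolean `passed`.
scores = client.scores.list(evaluation="eval_pqr678")
passed = sum(1 for s in scores if s.passed)
print(f"Pass rate: {passed / len(scores):.1%}")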

Object Relationships

- A project contains datasets, models, tasks, and evaluations (each carries a project field).
- A dataset contains samples.
- An evaluation runs a model against a dataset and produces scores.
- A task links a sample to a reviewer via assigned_to; completing it produces reviewer scores.
- Every score references a sample; automated scores also reference the evaluation that produced them.
