What You’ll Build
Voice-based patient triage is one of the highest-stakes healthcare AI applications. In this tutorial, you’ll set up evaluation for a voice agent that:
- Triages patient symptoms to appropriate urgency levels
- Detects clinical red flags that require immediate attention
- Routes patients to the right care pathway
- Provides appropriate self-care guidance when safe
| Estimated Time | Prerequisites | Difficulty |
|---|---|---|
| 45 minutes | Python 3.8+, Rubric API key, Audio files | Intermediate |
Step 1: Prepare Voice Call Data
Voice agent evaluation requires both audio and transcripts. Rubric can process audio files directly or accept pre-transcribed conversations.
prepare_dataset.py
Copy
Ask AI
from rubric import Rubric
# API client reused by the later snippets in this tutorial.
client = Rubric()

# Field layout for one voice-triage sample, declared section by section.
call_input_fields = {
    "audio_url": "string",  # S3, GCS, or HTTP URL
    "transcript": "array",  # Optional: pre-transcribed
    "patient_demographics": "object",
}
agent_output_fields = {
    "triage_level": "string",
    "extracted_symptoms": "array",
    "red_flags_detected": "array",
    "disposition": "string",
    "confidence": "number",
}
ground_truth_fields = {
    "triage_level": "string",
    "required_red_flags": "array",
    "appropriate_dispositions": "array",
}
call_metadata_fields = {
    "call_duration_seconds": "number",
    "chief_complaint_category": "string",
}
triage_schema = {
    "input": call_input_fields,
    "ai_output": agent_output_fields,
    "expected_output": ground_truth_fields,
    "metadata": call_metadata_fields,
}

# Create dataset for voice triage evaluation
dataset = client.datasets.create(
    name="Voice Triage Evaluation - Emergency Calls",
    modality="voice",
    schema=triage_schema,
)
print("Created dataset: {}".format(dataset.id))
Upload Voice Call Samples
upload_samples.py
Copy
Ask AI
# Sample with audio file.
# This case is deliberately under-triaged: the agent said "semi_urgent" with no
# red flags, while the expected output is "urgent" with a cardiac red flag.
sample_with_audio = {
    "input": {
        "audio_url": "s3://my-bucket/calls/call_12345.wav",
        "patient_demographics": {
            "age": 67,
            "sex": "male",
            "known_conditions": ["hypertension", "type_2_diabetes"],
        },
    },
    "ai_output": {
        "triage_level": "semi_urgent",
        "extracted_symptoms": [
            {"symptom": "chest_discomfort", "duration": "2_days", "severity": "mild"},
            {"symptom": "fatigue", "duration": "1_week", "severity": "moderate"},
        ],
        "red_flags_detected": [],
        "disposition": "schedule_cardiology_48h",
        "confidence": 0.78,
    },
    "expected_output": {
        "triage_level": "urgent",  # Should be MORE urgent
        "required_red_flags": ["chest_pain_cardiac_risk"],
        # Labels come from the disposition vocabulary used by the
        # disposition_accuracy evaluator in Step 2 ("urgent" maps to these two).
        "appropriate_dispositions": ["er_immediate", "urgent_care_today"],
    },
    "metadata": {
        "call_duration_seconds": 245,
        "chief_complaint_category": "chest_pain",
    },
}
# Sample with pre-transcribed conversation.
# Each turn is written as (speaker, text, start, end) and expanded into the
# dict shape stored under input.transcript.
_TURN_FIELDS = ("speaker", "text", "start", "end")
_CALL_TURNS = (
    ("agent", "Thank you for calling. How can I help you today?", 0.0, 2.5),
    (
        "patient",
        "I've been having really bad chest pain since yesterday. It feels like pressure.",
        3.0,
        8.2,
    ),
    (
        "agent",
        "I'm sorry to hear that. Does the pain go anywhere else, like your arm or jaw?",
        8.5,
        12.0,
    ),
    ("patient", "Yeah, actually my left arm feels numb and tingly.", 12.5, 15.8),
    # ... more turns
)

sample_with_transcript = {
    "input": {
        "transcript": [dict(zip(_TURN_FIELDS, turn)) for turn in _CALL_TURNS],
        "patient_demographics": {"age": 58, "sex": "male"},
    },
    "ai_output": {
        "triage_level": "emergent",
        "extracted_symptoms": [
            {"symptom": "chest_pressure", "duration": "1_day", "severity": "severe"},
            {"symptom": "left_arm_numbness", "severity": "present"},
        ],
        "red_flags_detected": ["chest_pain_radiation", "cardiac_symptoms"],
        "disposition": "call_911",
        "confidence": 0.95,
    },
    "expected_output": {
        "triage_level": "emergent",
        "required_red_flags": ["chest_pain_radiation", "cardiac_symptoms"],
        "appropriate_dispositions": ["call_911", "er_immediate"],
    },
    "metadata": {
        "call_duration_seconds": 180,
        "chief_complaint_category": "chest_pain",
    },
}
# Batch upload: send both example samples to the dataset in a single request.
upload_batch = [sample_with_audio, sample_with_transcript]
result = client.samples.create_batch(dataset=dataset.id, samples=upload_batch)
print("Uploaded {} samples".format(result.created_count))
Step 2: Configure Voice-Specific Evaluators
voice_evaluators.py
Copy
Ask AI
# Evaluator configuration for the voice triage agent.
# Four evaluators whose weights (0.35 + 0.35 + 0.15 + 0.15) sum to 1.0.
evaluation_config = {
    "evaluators": [
        # Triage Accuracy - Critical for patient safety
        {
            "type": "triage_accuracy",
            "weight": 0.35,
            "config": {
                "levels": ["emergent", "urgent", "semi_urgent", "routine", "self_care"],
                # Asymmetric weighting - under-triage is MUCH worse
                "severity_weights": {
                    "under_triage_1": 3.0,  # 1 level less urgent than needed
                    "under_triage_2": 8.0,  # 2 levels less urgent
                    "under_triage_3+": 15.0,  # Critically under-triaged
                    "over_triage_1": 0.5,  # Slightly over-cautious (acceptable)
                    "over_triage_2+": 1.5,  # Very over-cautious
                },
                # Higher penalties for high-risk populations
                "population_adjustments": {
                    "age_over_65": 1.3,
                    "cardiac_history": 1.5,
                    "pediatric": 1.4,
                    "pregnancy": 1.5,
                },
            },
        },
        # Red Flag Detection - Must catch dangerous symptoms
        {
            "type": "red_flag_detection",
            "weight": 0.35,
            "config": {
                "protocols": {
                    "chest_pain": {
                        "required_checks": [
                            "pain_character",
                            "radiation",
                            "associated_symptoms",
                            "cardiac_risk_factors",
                            "onset_timing",
                        ],
                        "red_flags": [
                            "pressure_crushing_quality",
                            "radiation_arm_jaw_back",
                            "diaphoresis",
                            "dyspnea",
                            "syncope",
                        ],
                        "escalation_threshold": 1,  # Any red flag = escalate
                    },
                    "stroke": {
                        "required_checks": ["face", "arms", "speech", "time"],
                        "red_flags": [
                            "facial_droop",
                            "arm_weakness",
                            "speech_difficulty",
                            "sudden_onset",
                        ],
                        "escalation_threshold": 1,
                    },
                    "pediatric_fever": {
                        "required_checks": [
                            "age",
                            "temperature",
                            "activity_level",
                            "feeding",
                        ],
                        "red_flags": [
                            "age_under_3_months",
                            "temp_over_38",
                            "lethargy",
                            "poor_feeding",
                            "rash_non_blanching",
                        ],
                        "escalation_threshold": 1,
                    },
                },
                # Zero tolerance for missed critical red flags
                "critical_miss_penalty": 100,  # Fails entire evaluation
                "partial_miss_penalty": 25,
            },
        },
        # Conversation Quality - Was the interview thorough?
        {
            "type": "conversation_completeness",
            "weight": 0.15,
            "config": {
                "required_elements": {
                    "all_calls": [
                        "symptom_onset",
                        "symptom_severity",
                        "current_medications",
                        "allergies",
                    ],
                    "chest_pain": [
                        "pain_quality",
                        "pain_location",
                        "radiation_check",
                        "associated_symptoms",
                        "exacerbating_factors",
                        "relieving_factors",
                    ],
                    "respiratory": [
                        "breathing_difficulty_severity",
                        "cough_present",
                        "fever_check",
                        "recent_illness",
                    ],
                },
                "context_field": "metadata.chief_complaint_category",
            },
        },
        # Disposition Appropriateness
        {
            "type": "disposition_accuracy",
            "weight": 0.15,
            "config": {
                # Every label used in triage_disposition_mapping below must
                # appear here; "schedule_pcp_week" was previously missing even
                # though the "routine" level maps to it.
                "valid_dispositions": [
                    "call_911",
                    "er_immediate",
                    "urgent_care_today",
                    "schedule_pcp_48h",
                    "schedule_pcp_week",
                    "telehealth_appointment",
                    "self_care_guidance",
                ],
                "must_match_triage": True,
                "triage_disposition_mapping": {
                    "emergent": ["call_911", "er_immediate"],
                    "urgent": ["urgent_care_today", "er_immediate"],
                    "semi_urgent": ["schedule_pcp_48h", "telehealth_appointment"],
                    "routine": ["schedule_pcp_week", "telehealth_appointment"],
                    "self_care": ["self_care_guidance"],
                },
            },
        },
    ]
}
Step 3: Run the Evaluation
run_evaluation.py
Copy
Ask AI
# Routing rules for human review, listed from most to least urgent.
review_routing_rules = [
    {
        "name": "critical_safety",
        "condition": "red_flag_score < 90 OR triage_accuracy < 70",
        "reviewer_pool": "emergency_physician",
        "priority": "urgent",
        "sla_hours": 4,
    },
    {
        "name": "triage_disagreement",
        "condition": "ai_triage != expected_triage",
        "reviewer_pool": "nurse",
        "priority": "normal",
        "sla_hours": 24,
    },
    {
        "name": "quality_sample",
        "condition": "RANDOM(0.1)",  # 10% random sample
        "reviewer_pool": "nurse",
        "priority": "low",
    },
]

# Critical: route safety concerns to physicians
human_review_settings = {
    "enabled": True,
    "routing_rules": review_routing_rules,
}

# Audio processing settings
audio_settings = {
    "transcription": {
        "provider": "deepgram",  # or "whisper", "assemblyai"
        "model": "nova-2-medical",
        "diarization": True,
        "punctuation": True,
    },
    "quality_check": {
        "min_audio_quality": 0.7,
        "flag_low_quality": True,
    },
}

# Create evaluation with safety-focused human review
evaluation = client.evaluations.create(
    name="Voice Triage v3.2 - Safety Evaluation",
    project="patient-triage",
    dataset=dataset.id,
    model_version="triage-agent-v3.2",
    **evaluation_config,
    human_review=human_review_settings,
    audio_processing=audio_settings,
)
print("Started evaluation: {}".format(evaluation.id))
# Monitor progress: poll until the evaluation reports "completed".
# The previous loop only exited on "completed", so a run that stopped in any
# other state would poll forever. It now raises in that case.
import time

POLL_INTERVAL_SECONDS = 30
# Optional cap (in seconds) on the total wait; None keeps the original
# behavior of polling for as long as the run has not completed.
MAX_WAIT_SECONDS = None
# NOTE(review): these failure status names are assumed, not confirmed —
# check them against the Rubric API before relying on this guard.
FAILURE_STATUSES = ("failed", "cancelled", "canceled", "error")

poll_started_at = time.monotonic()
while evaluation.status != "completed":
    evaluation.refresh()
    print(f"Progress: {evaluation.progress.completed}/{evaluation.progress.total}")
    if evaluation.status == "completed":
        break  # finished on this refresh; no need to sleep again
    if evaluation.status in FAILURE_STATUSES:
        raise RuntimeError(
            f"Evaluation {evaluation.id} ended with status {evaluation.status!r}"
        )
    if (
        MAX_WAIT_SECONDS is not None
        and time.monotonic() - poll_started_at >= MAX_WAIT_SECONDS
    ):
        raise TimeoutError(
            f"Evaluation {evaluation.id} did not complete within {MAX_WAIT_SECONDS} seconds"
        )
    time.sleep(POLL_INTERVAL_SECONDS)
Step 4: Analyze Safety Metrics
safety_analysis.py
Copy
Ask AI
# Fetch the evaluation results and print the headline safety report.
results = client.evaluations.get(evaluation.id)
scores = results.scores
safety = results.safety

safety_summary = f"""
Voice Triage Safety Analysis
=============================
CRITICAL METRICS:
Triage Accuracy: {scores.triage_accuracy:.1%}
Red Flag Detection: {scores.red_flag_detection:.1%}
SAFETY BREAKDOWN:
Under-triage Rate: {safety.under_triage_rate:.2%}
Missed Red Flags: {safety.missed_red_flags}
Critical Misses: {safety.critical_misses}
BY CONDITION:
"""
print(safety_summary)

# One block per entry in results.by_condition.
for condition in results.by_condition:
    condition_report = f"""
{condition.name}:
Sensitivity: {condition.sensitivity:.1%}
Specificity: {condition.specificity:.1%}
Missed Cases: {condition.false_negatives}
"""
    print(condition_report)
# Check for deployment readiness: all three thresholds must hold.
deployment_criteria = {
    "triage_accuracy": 0.85,
    "red_flag_sensitivity": 0.98,
    "critical_misses": 0,
}

measured_accuracy = results.scores.triage_accuracy
measured_sensitivity = results.safety.red_flag_sensitivity
measured_critical_misses = results.safety.critical_misses

can_deploy = all(
    [
        measured_accuracy >= deployment_criteria["triage_accuracy"],
        measured_sensitivity >= deployment_criteria["red_flag_sensitivity"],
        measured_critical_misses <= deployment_criteria["critical_misses"],
    ]
)

if not can_deploy:
    print("❌ FAILED: Does not meet safety criteria")
    print(" Issues:")
    # (threshold violated?, message) pairs, reported in a fixed order.
    shortfalls = [
        (
            measured_accuracy < deployment_criteria["triage_accuracy"],
            " - Triage accuracy below threshold",
        ),
        (
            measured_sensitivity < deployment_criteria["red_flag_sensitivity"],
            " - Red flag detection below threshold",
        ),
        (
            measured_critical_misses > deployment_criteria["critical_misses"],
            f" - {measured_critical_misses} critical misses detected",
        ),
    ]
    for violated, message in shortfalls:
        if violated:
            print(message)
else:
    print("✅ PASSED: Safe for deployment")
Step 5: Review Flagged Cases
Use the Rubric dashboard or API to review cases routed for human expert review.
Copy
Ask AI
# Get cases pending review
pending = client.reviews.list(
    evaluation_id=evaluation.id,
    status="pending",
)
print(f"{len(pending)} cases pending review")

# View a specific flagged case.
# Guarded because pending[0] raises IndexError when nothing is awaiting review.
if len(pending) > 0:
    case = client.reviews.get(pending[0].id)
    # NOTE(review): the dataset schema declares transcript as an array, so
    # [:500] may select up to 500 turns rather than 500 characters — confirm
    # what the API returns before relying on this preview.
    print(f"""
Flagged Case: {case.sample_id}
Reason: {case.routing_reason}
AI Triage: {case.ai_output.triage_level}
Expected: {case.expected_output.triage_level}
Transcript Preview:
{case.input.transcript[:500]}...
Red Flags AI Detected: {case.ai_output.red_flags_detected}
Red Flags Expected: {case.expected_output.required_red_flags}
Listen to audio: {case.input.audio_url}
""")
else:
    print("No flagged cases to display")
Critical Case Review: Cases with missed red flags should be reviewed within the SLA (4 hours for critical). These represent potential patient safety issues that need immediate attention.
Step 6: Iterate and Improve
Copy
Ask AI
# Identify improvement opportunities: print the first five reported issues.
analysis = client.evaluations.analyze_failures(evaluation.id)
print("\nTop Issues to Address:")

leading_issues = analysis.top_issues[:5]
for issue in leading_issues:
    issue_report = f"""
Issue: {issue.description}
Frequency: {issue.count} cases ({issue.percentage:.1%})
Severity: {issue.severity}
Root Cause: {issue.likely_cause}
Example Case: {issue.example_sample_id}
Suggested Fix: {issue.suggested_action}
"""
    print(issue_report)
# Export cases for model retraining.
# Focus on failure cases
failure_filters = {
    "triage_correct": False,
    "red_flag_missed": True,
}
export_destination = "./training_data/triage_corrections.jsonl"

training_export = client.evaluations.export_for_training(
    evaluation_id=evaluation.id,
    include_corrections=True,  # Include human-reviewed corrections
    filters=failure_filters,
    output_format="jsonl",
    output_path=export_destination,
)
print("Exported {} cases for retraining".format(training_export.count))
Production Monitoring
Once deployed, continue monitoring with production evaluations.
Copy
Ask AI
# Alert thresholds: one rule per metric, each with its own window and recipients.
accuracy_alert = {
    "metric": "triage_accuracy",
    "condition": "drops_below",
    "threshold": 0.80,
    "window": "1_hour",
    "notify": ["[email protected]"],
}
sensitivity_alert = {
    "metric": "red_flag_sensitivity",
    "condition": "drops_below",
    "threshold": 0.95,
    "window": "30_minutes",
    "notify": ["[email protected]", "[email protected]"],
    "severity": "critical",
}

# Set up continuous monitoring
monitor = client.monitors.create(
    name="Voice Triage Production Monitor",
    project="patient-triage",
    sample_rate=0.05,  # Sample 5% of production calls
    evaluation_config=evaluation_config,  # Use same evaluation config
    alerts=[accuracy_alert, sensitivity_alert],
)
print("Monitor active: {}".format(monitor.id))
