## What You'll Build
In this tutorial, you'll set up a complete evaluation pipeline for a clinical NLP model that generates SOAP notes from patient encounter transcripts. By the end, you'll have:

- A dataset of clinical encounters with ground-truth annotations
- Automated evaluators for accuracy, completeness, and hallucination detection
- A human review workflow for borderline cases
- A metrics dashboard to track model performance
| Estimated Time | Prerequisites | Difficulty |
|---|---|---|
| 30 minutes | Python 3.8+, Rubric API key | Intermediate |
## Step 1: Install and Configure
```bash
pip install rubric
```
setup.py
```python
from rubric import Rubric

# Initialize the client
client = Rubric(api_key="rb_live_xxxxxxxx")

# Verify the connection
print(client.health_check())  # Should print "ok"
```
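If you'd rather not hardcode the key, read it from the environment instead. A minimal sketch; the `RUBRIC_API_KEY` variable name is our own convention, not something the SDK mandates:

```python
import os

from rubric import Rubric

# Read the key from the environment so it never lands in version control.
# RUBRIC_API_KEY is an arbitrary name we chose; any variable works.
api_key = os.environ.get("RUBRIC_API_KEY")
if not api_key:
    raise RuntimeError("Set RUBRIC_API_KEY before running the evaluation scripts.")

client = Rubric(api_key=api_key)
```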
## Step 2: Prepare Your Dataset
First, create a dataset with clinical encounters and their expected outputs.

create_dataset.py
```python
# Create a new dataset for clinical note evaluation
dataset = client.datasets.create(
    name="SOAP Note Evaluation - Q1 2025",
    description="Evaluation dataset for clinical note generation model",
    modality="clinical_notes",
    # Define the schema for your samples
    schema={
        "input": {
            "transcript": "string",  # Raw encounter transcript
            "patient_demographics": "object",
            "chief_complaint": "string"
        },
        "expected_output": {
            "subjective": "string",
            "objective": "string",
            "assessment": "string",
            "plan": "string",
            "icd_codes": "array"
        },
        "metadata": {
            "encounter_type": "string",
            "specialty": "string",
            "complexity": "string"
        }
    }
)

print(f"Created dataset: {dataset.id}")
```
### Add Samples to Your Dataset
add_samples.py
```python
# Add samples with ground-truth annotations
samples = [
    {
        "input": {
            "transcript": """
            Doctor: Good morning, what brings you in today?
            Patient: I've been having this persistent cough for about two weeks now.
            Doctor: Is it a dry cough or are you bringing up any mucus?
            Patient: It's mostly dry, but sometimes I cough up a little clear stuff.
            Doctor: Any fever, shortness of breath, or chest pain?
            Patient: No fever, but I do feel a bit short of breath when climbing stairs.
            Doctor: Any history of asthma or allergies?
            Patient: I have seasonal allergies, take Zyrtec.
            Doctor: Current smoker?
            Patient: No, never smoked.
            """,
            "patient_demographics": {
                "age": 42,
                "sex": "female"
            },
            "chief_complaint": "Persistent cough x 2 weeks"
        },
        "expected_output": {
            "subjective": "42-year-old female presents with 2-week history of persistent, predominantly dry cough. Occasional clear sputum production. Reports dyspnea on exertion (climbing stairs). Denies fever or chest pain. PMH: seasonal allergies on cetirizine. Never smoker.",
            "objective": "[To be completed by examiner]",
            "assessment": "1. Acute bronchitis, likely viral\n2. Post-nasal drip syndrome (possible contributing factor given allergy history)\n3. Rule out reactive airway disease",
            "plan": "1. Supportive care with increased fluids\n2. OTC dextromethorphan for cough suppression PRN\n3. Continue cetirizine\n4. Return if symptoms worsen or persist >3 weeks\n5. Consider PFTs if cough persists to evaluate for asthma",
            "icd_codes": ["J20.9", "R05.9"]
        },
        "metadata": {
            "encounter_type": "outpatient",
            "specialty": "primary_care",
            "complexity": "moderate"
        }
    },
    # Add more samples...
]

# Batch upload samples
result = client.samples.create_batch(
    dataset=dataset.id,
    samples=samples
)

print(f"Added {result.created_count} samples")
```
**Sample Size Recommendations:** For reliable metrics, aim for at least 100 samples per specialty/complexity combination. Include edge cases and known failure modes from production.
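You can sanity-check whether an existing dataset meets that bar before running anything. A quick sketch, assuming (as in Step 4 below) that `client.samples.list` returns objects whose `metadata` supports dict-style access:

```python
from collections import Counter

# Tally samples per (specialty, complexity) stratum
samples = client.samples.list(dataset=dataset.id)
strata = Counter(
    (s.metadata["specialty"], s.metadata["complexity"]) for s in samples
)

for (specialty, complexity), count in sorted(strata.items()):
    flag = "" if count >= 100 else "  <-- below recommended minimum"
    print(f"{specialty}/{complexity}: {count}{flag}")
```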
## Step 3: Configure Evaluators
Set up evaluators that will automatically assess your model's outputs.

configure_evaluators.py
```python
# Define evaluation configuration
evaluation_config = {
    "evaluators": [
        # Clinical Accuracy - does the note capture correct medical facts?
        {
            "type": "clinical_accuracy",
            "weight": 0.30,
            "config": {
                "check_symptoms": True,
                "check_medications": True,
                "check_diagnoses": True,
                "check_procedures": True,
                "terminology_strictness": "moderate"
            }
        },
        # Completeness - are all required elements present?
        {
            "type": "completeness",
            "weight": 0.25,
            "config": {
                "required_sections": ["subjective", "assessment", "plan"],
                "required_elements": {
                    "subjective": ["chief_complaint", "hpi", "pmh", "medications"],
                    "assessment": ["primary_diagnosis", "differential"],
                    "plan": ["treatment", "follow_up"]
                },
                "penalize_missing": True
            }
        },
        # Hallucination Detection - is everything grounded in the transcript?
        {
            "type": "hallucination_detection",
            "weight": 0.25,
            "config": {
                "grounding_source": "input.transcript",
                "check_categories": [
                    "medications_not_mentioned",
                    "symptoms_not_reported",
                    "fabricated_history",
                    "invented_findings"
                ],
                "severity_levels": {
                    "critical": ["wrong_medication", "fabricated_allergy"],
                    "high": ["fabricated_symptom", "wrong_duration"],
                    "medium": ["exaggerated_severity"]
                }
            }
        },
        # ICD-10 Accuracy - are the codes correct?
        {
            "type": "coding_accuracy",
            "weight": 0.20,
            "config": {
                "code_system": "icd10",
                "match_level": "category",  # Match at the 3-character category level (e.g., J20)
                "check_specificity": True,
                "check_primary_secondary_order": True
            }
        }
    ]
}
```
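The `weight` values (0.30 + 0.25 + 0.25 + 0.20 = 1.0) control how per-evaluator scores roll up into the composite score you'll see in Step 6. Rubric computes this server-side; the sketch below merely illustrates the arithmetic, with made-up scores on a 0-100 scale:

```python
def composite_score(scores, weights):
    """Weighted average of per-evaluator scores (weights should sum to 1.0)."""
    assert abs(sum(weights.values()) - 1.0) < 1e-9, "weights must sum to 1.0"
    return sum(scores[name] * w for name, w in weights.items())

weights = {e["type"]: e["weight"] for e in evaluation_config["evaluators"]}
scores = {
    "clinical_accuracy": 91.0,  # illustrative numbers only
    "completeness": 88.0,
    "hallucination_detection": 95.0,
    "coding_accuracy": 82.0,
}
print(composite_score(scores, weights))  # 89.45
```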
## Step 4: Run Your Model and Collect Outputs
Generate outputs from your model for each sample in the dataset.

generate_outputs.py
```python
import your_model  # Your clinical NLP model

# Get samples from the dataset
samples = client.samples.list(dataset=dataset.id)

# Generate model outputs
for sample in samples:
    # Call your model
    model_output = your_model.generate_soap_note(
        transcript=sample.input["transcript"],
        demographics=sample.input["patient_demographics"],
        chief_complaint=sample.input["chief_complaint"]
    )

    # Log the output to Rubric
    client.samples.add_output(
        sample_id=sample.id,
        model_version="soap-generator-v2.1",
        output={
            "subjective": model_output.subjective,
            "objective": model_output.objective,
            "assessment": model_output.assessment,
            "plan": model_output.plan,
            "icd_codes": model_output.codes
        },
        metadata={
            "latency_ms": model_output.latency,
            "tokens_used": model_output.token_count
        }
    )

print(f"Generated outputs for {len(samples)} samples")
```
## Step 5: Create and Run the Evaluation
run_evaluation.py
```python
# Create the evaluation
evaluation = client.evaluations.create(
    name="SOAP Generator v2.1 Evaluation",
    project="clinical-notes",
    dataset=dataset.id,
    model_version="soap-generator-v2.1",
    # Use our configured evaluators
    **evaluation_config,
    # Configure human review for uncertain cases
    human_review={
        "enabled": True,
        "routing_rules": [
            {
                "condition": "hallucination_score < 80",
                "reviewer_pool": "physician",
                "priority": "high"
            },
            {
                "condition": "completeness_score < 70",
                "reviewer_pool": "nurse",
                "priority": "normal"
            }
        ],
        "sample_rate": 0.1  # Also review a 10% random sample
    }
)

print(f"Started evaluation: {evaluation.id}")

# Wait for the automated evaluation to complete
evaluation.wait(stage="automated")

print("Automated evaluation complete")
print(f"  Samples evaluated: {evaluation.progress.completed}")
print(f"  Routed for review: {evaluation.progress.pending_review}")
```
## Step 6: Review Results
view_results.py
```python
# Get evaluation results
results = client.evaluations.get(evaluation.id)

print(f"""
Evaluation Results: {results.name}
=====================================
Overall Score: {results.composite_score:.1%}

By Evaluator:
  Clinical Accuracy:  {results.scores.clinical_accuracy:.1%}
  Completeness:       {results.scores.completeness:.1%}
  Hallucination-Free: {results.scores.hallucination_detection:.1%}
  Coding Accuracy:    {results.scores.coding_accuracy:.1%}

Safety Metrics:
  Critical Hallucinations: {results.safety.critical_hallucination_count}
  Missed Required Fields:  {results.safety.missing_required_count}

Human Review Status:
  Completed: {results.human_review.completed}
  Pending:   {results.human_review.pending}
  Avg Agreement: {results.human_review.agreement_rate:.1%}
""")

# View samples with issues
print("\nSamples with Critical Issues:")
for sample in results.flagged_samples[:5]:
    print(f"  {sample.id}: {sample.primary_issue}")
```
### Analyze Failure Patterns
```python
# Get a detailed failure analysis
analysis = client.evaluations.analyze_failures(evaluation.id)

print("\nTop Failure Patterns:")
for pattern in analysis.patterns[:5]:
    print(f"""
    Pattern: {pattern.description}
    Frequency: {pattern.count} ({pattern.percentage:.1%} of failures)
    Severity: {pattern.severity}
    Example: {pattern.example_sample_id}
    Suggested Fix: {pattern.suggested_action}
    """)
```
## Step 7: Export for Compliance
Generate reports suitable for regulatory documentation.
```python
# Export a detailed report
client.evaluations.export(
    evaluation.id,
    format="regulatory_report",
    output_path="./reports/soap_evaluation_report.pdf",
    include=[
        "executive_summary",
        "methodology",
        "dataset_description",
        "results_by_evaluator",
        "failure_analysis",
        "human_review_summary",
        "statistical_analysis",
        "appendix_sample_details"
    ]
)

print("Report exported to ./reports/soap_evaluation_report.pdf")
```
