
What You’ll Build

In this tutorial, you’ll set up a complete evaluation pipeline for a clinical NLP model that generates SOAP notes from patient encounter transcripts. By the end, you’ll have:
  • A dataset of clinical encounters with ground truth annotations
  • Automated evaluators for accuracy, completeness, and hallucination detection
  • Human review workflow for borderline cases
  • Metrics dashboard to track model performance
Estimated Time: 30 minutes
Prerequisites: Python 3.8+, Rubric API key
Difficulty: Intermediate

Step 1: Install and Configure

pip install rubric
setup.py
from rubric import Rubric

# Initialize client
client = Rubric(api_key="rb_live_xxxxxxxx")

# Verify connection
print(client.health_check())  # Should print "ok"
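Hardcoding a live API key is fine for a quick local test, but anything you commit or share should read the key from the environment instead. A minimal sketch, assuming a hypothetical RUBRIC_API_KEY variable (the variable name is an illustration, not an official Rubric convention):

import os

from rubric import Rubric

# Read the key from the environment; RUBRIC_API_KEY is an assumed name,
# not one documented by Rubric.
api_key = os.environ.get("RUBRIC_API_KEY")
if not api_key:
    raise RuntimeError("Set the RUBRIC_API_KEY environment variable first")

client = Rubric(api_key=api_key)
print(client.health_check())  # Should still print "ok"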

Step 2: Prepare Your Dataset

First, create a dataset with clinical encounters and their expected outputs.
create_dataset.py
# Create a new dataset for clinical note evaluation
dataset = client.datasets.create(
    name="SOAP Note Evaluation - Q1 2025",
    description="Evaluation dataset for clinical note generation model",
    modality="clinical_notes",

    # Define the schema for your samples
    schema={
        "input": {
            "transcript": "string",  # Raw encounter transcript
            "patient_demographics": "object",
            "chief_complaint": "string"
        },
        "expected_output": {
            "subjective": "string",
            "objective": "string",
            "assessment": "string",
            "plan": "string",
            "icd_codes": "array"
        },
        "metadata": {
            "encounter_type": "string",
            "specialty": "string",
            "complexity": "string"
        }
    }
)

print(f"Created dataset: {dataset.id}")

Add Samples to Your Dataset

add_samples.py
# Add samples with ground truth annotations
samples = [
    {
        "input": {
            "transcript": """
            Doctor: Good morning, what brings you in today?
            Patient: I've been having this persistent cough for about two weeks now.
            Doctor: Is it a dry cough or are you bringing up any mucus?
            Patient: It's mostly dry, but sometimes I cough up a little clear stuff.
            Doctor: Any fever, shortness of breath, or chest pain?
            Patient: No fever, but I do feel a bit short of breath when climbing stairs.
            Doctor: Any history of asthma or allergies?
            Patient: I have seasonal allergies, take Zyrtec.
            Doctor: Current smoker?
            Patient: No, never smoked.
            """,
            "patient_demographics": {
                "age": 42,
                "sex": "female"
            },
            "chief_complaint": "Persistent cough x 2 weeks"
        },
        "expected_output": {
            "subjective": "42-year-old female presents with 2-week history of persistent, predominantly dry cough. Occasional clear sputum production. Reports dyspnea on exertion (climbing stairs). Denies fever or chest pain. PMH: seasonal allergies on cetirizine. Never smoker.",
            "objective": "[To be completed by examiner]",
            "assessment": "1. Acute bronchitis, likely viral\n2. Post-nasal drip syndrome (possible contributing factor given allergy history)\n3. Rule out reactive airway disease",
            "plan": "1. Supportive care with increased fluids\n2. OTC dextromethorphan for cough suppression PRN\n3. Continue cetirizine\n4. Return if symptoms worsen or persist >3 weeks\n5. Consider PFTs if cough persists to evaluate for asthma",
            "icd_codes": ["J20.9", "R05.9"]
        },
        "metadata": {
            "encounter_type": "outpatient",
            "specialty": "primary_care",
            "complexity": "moderate"
        }
    },
    # Add more samples...
]

# Batch upload samples
result = client.samples.create_batch(
    dataset=dataset.id,
    samples=samples
)

print(f"Added {result.created_count} samples")
Sample Size Recommendations: For reliable metrics, aim for at least 100 samples per specialty/complexity combination. Include edge cases and known failure modes from production.
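To check whether your dataset actually meets that target, you can tally samples by specialty and complexity before kicking off an evaluation. A minimal sketch over the local samples list built above (the counting logic is plain Python, not a Rubric API feature):

from collections import Counter

# Count samples per (specialty, complexity) combination
coverage = Counter(
    (s["metadata"]["specialty"], s["metadata"]["complexity"])
    for s in samples
)

for (specialty, complexity), count in sorted(coverage.items()):
    flag = "" if count >= 100 else "  <-- below the 100-sample target"
    print(f"{specialty} / {complexity}: {count}{flag}")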

Step 3: Configure Evaluators

Set up evaluators that will automatically assess your model’s outputs.
configure_evaluators.py
# Define evaluation configuration
evaluation_config = {
    "evaluators": [
        # Clinical Accuracy - Does the note capture correct medical facts?
        {
            "type": "clinical_accuracy",
            "weight": 0.30,
            "config": {
                "check_symptoms": True,
                "check_medications": True,
                "check_diagnoses": True,
                "check_procedures": True,
                "terminology_strictness": "moderate"
            }
        },

        # Completeness - Are all required elements present?
        {
            "type": "completeness",
            "weight": 0.25,
            "config": {
                "required_sections": ["subjective", "assessment", "plan"],
                "required_elements": {
                    "subjective": ["chief_complaint", "hpi", "pmh", "medications"],
                    "assessment": ["primary_diagnosis", "differential"],
                    "plan": ["treatment", "follow_up"]
                },
                "penalize_missing": True
            }
        },

        # Hallucination Detection - Is everything grounded in the transcript?
        {
            "type": "hallucination_detection",
            "weight": 0.25,
            "config": {
                "grounding_source": "input.transcript",
                "check_categories": [
                    "medications_not_mentioned",
                    "symptoms_not_reported",
                    "fabricated_history",
                    "invented_findings"
                ],
                "severity_levels": {
                    "critical": ["wrong_medication", "fabricated_allergy"],
                    "high": ["fabricated_symptom", "wrong_duration"],
                    "medium": ["exaggerated_severity"]
                }
            }
        },

        # ICD-10 Accuracy - Are the codes correct?
        {
            "type": "coding_accuracy",
            "weight": 0.20,
            "config": {
                "code_system": "icd10",
                "match_level": "category",  # Match at 3-digit level
                "check_specificity": True,
                "check_primary_secondary_order": True
            }
        }
    ]
}
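The four weights above sum to 1.0. The tutorial doesn't say whether Rubric normalizes weights for you, so a quick local sanity check before launching a run is cheap insurance (the assertion is a local safeguard, not a Rubric requirement):

# Sanity check: confirm evaluator weights sum to 1.0 before running
total_weight = sum(e["weight"] for e in evaluation_config["evaluators"])
assert abs(total_weight - 1.0) < 1e-9, f"Evaluator weights sum to {total_weight}, expected 1.0"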

Step 4: Run Your Model and Collect Outputs

Generate outputs from your model for each sample in the dataset.
generate_outputs.py
import your_model  # Your clinical NLP model

# Get samples from dataset
samples = client.samples.list(dataset=dataset.id)

# Generate model outputs
for sample in samples:
    # Call your model
    model_output = your_model.generate_soap_note(
        transcript=sample.input["transcript"],
        demographics=sample.input["patient_demographics"],
        chief_complaint=sample.input["chief_complaint"]
    )

    # Log the output to Rubric
    client.samples.add_output(
        sample_id=sample.id,
        model_version="soap-generator-v2.1",
        output={
            "subjective": model_output.subjective,
            "objective": model_output.objective,
            "assessment": model_output.assessment,
            "plan": model_output.plan,
            "icd_codes": model_output.codes
        },
        metadata={
            "latency_ms": model_output.latency,
            "tokens_used": model_output.token_count
        }
    )

print(f"Generated outputs for {len(samples)} samples")

Step 5: Create and Run the Evaluation

run_evaluation.py
# Create the evaluation
evaluation = client.evaluations.create(
    name="SOAP Generator v2.1 Evaluation",
    project="clinical-notes",
    dataset=dataset.id,
    model_version="soap-generator-v2.1",

    # Use our configured evaluators
    **evaluation_config,

    # Configure human review for uncertain cases
    human_review={
        "enabled": True,
        "routing_rules": [
            {
                "condition": "hallucination_score < 80",
                "reviewer_pool": "physician",
                "priority": "high"
            },
            {
                "condition": "completeness_score < 70",
                "reviewer_pool": "nurse",
                "priority": "normal"
            }
        ],
        "sample_rate": 0.1  # Also review 10% random sample
    }
)

print(f"Started evaluation: {evaluation.id}")

# Wait for automated evaluation to complete
evaluation.wait(stage="automated")

print(f"Automated evaluation complete")
print(f"  Samples evaluated: {evaluation.progress.completed}")
print(f"  Routed for review: {evaluation.progress.pending_review}")

Step 6: Review Results

view_results.py
# Get evaluation results
results = client.evaluations.get(evaluation.id)

print(f"""
Evaluation Results: {results.name}
=====================================

Overall Score: {results.composite_score:.1%}

By Evaluator:
  Clinical Accuracy:     {results.scores.clinical_accuracy:.1%}
  Completeness:          {results.scores.completeness:.1%}
  Hallucination-Free:    {results.scores.hallucination_detection:.1%}
  Coding Accuracy:       {results.scores.coding_accuracy:.1%}

Safety Metrics:
  Critical Hallucinations: {results.safety.critical_hallucination_count}
  Missed Required Fields:  {results.safety.missing_required_count}

Human Review Status:
  Completed: {results.human_review.completed}
  Pending:   {results.human_review.pending}
  Avg Agreement: {results.human_review.agreement_rate:.1%}
""")

# View samples with issues
print("\nSamples with Critical Issues:")
for sample in results.flagged_samples[:5]:
    print(f"  {sample.id}: {sample.primary_issue}")

Analyze Failure Patterns

# Get detailed failure analysis
analysis = client.evaluations.analyze_failures(evaluation.id)

print("\nTop Failure Patterns:")
for pattern in analysis.patterns[:5]:
    print(f"""
  Pattern: {pattern.description}
  Frequency: {pattern.count} ({pattern.percentage:.1%} of failures)
  Severity: {pattern.severity}
  Example: {pattern.example_sample_id}
  Suggested Fix: {pattern.suggested_action}
""")

Step 7: Export for Compliance

Generate reports suitable for regulatory documentation.
# Export detailed report
client.evaluations.export(
    evaluation.id,
    format="regulatory_report",
    output_path="./reports/soap_evaluation_report.pdf",
    include=[
        "executive_summary",
        "methodology",
        "dataset_description",
        "results_by_evaluator",
        "failure_analysis",
        "human_review_summary",
        "statistical_analysis",
        "appendix_sample_details"
    ]
)

print("Report exported to ./reports/soap_evaluation_report.pdf")

Next Steps