When Human Review Matters

Not every AI output needs human review. The goal is to focus expert attention where it adds the most value—on cases where automated evaluation is uncertain or where clinical nuance is required.

| Scenario | Review Recommended | Rationale |
| --- | --- | --- |
| Automated score < 70% | Yes | Low confidence in automated assessment |
| Safety flag triggered | Yes | Potential patient harm requires expert review |
| Edge case detection | Yes | Unusual presentation needs clinical judgment |
| High confidence correct | Sample only | Spot-check to calibrate automation |
| Clear-cut failures | Optional | May already have sufficient signal |
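
The routing logic in this table can also be expressed directly in code. The sketch below is illustrative only; the field names (automated_score, safety_flag, is_edge_case) are assumptions, and the platform-level equivalent is the routing_rules configuration shown in the next section.

import random
from dataclasses import dataclass

@dataclass
class EvalResult:
    automated_score: float  # 0-100 automated evaluation score
    safety_flag: bool       # a safety evaluator flagged potential harm
    is_edge_case: bool      # unusual presentation detected

def needs_human_review(result: EvalResult, spot_check_rate: float = 0.05) -> bool:
    """Apply the review triggers from the table above."""
    if result.safety_flag or result.is_edge_case:
        return True
    if result.automated_score < 70:
        return True
    # High-confidence correct outputs are only spot-checked
    return random.random() < spot_check_rate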

Review Workflow Design

Routing Configuration

review_workflow.py
from rubric import Rubric

client = Rubric()

evaluation = client.evaluations.create(
    name="Triage Evaluation with Review",
    dataset="ds_production_calls",
    evaluators=[...],

    human_review={
        "enabled": True,

        # Routing rules (evaluated in order)
        "routing_rules": [
            {
                "name": "safety_critical",
                "condition": "safety_score < 80 OR missed_red_flags > 0",
                "reviewer_pool": "physician",
                "priority": "urgent",
                "sla_hours": 4
            },
            {
                "name": "uncertain_triage",
                "condition": "triage_confidence < 0.7",
                "reviewer_pool": "nurse",
                "priority": "normal",
                "sla_hours": 24
            },
            {
                "name": "quality_sample",
                "condition": "RANDOM(0.05)",  # 5% random sample
                "reviewer_pool": "any_clinical",
                "priority": "low",
                "sla_hours": 72
            }
        ],

        # Default for cases not matching rules
        "default_action": "auto_approve",

        # Review interface configuration
        "interface": {
            "show_ai_reasoning": True,
            "show_confidence_scores": True,
            "require_justification": True,
            "allow_partial_review": False
        }
    }
)
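
The "evaluated in order" comment implies first-match semantics: a case is routed by the first rule whose condition holds, and cases matching no rule fall through to default_action. A minimal local sketch of that behavior, assuming each rule's condition string has already been evaluated to a boolean, might look like:

def route_case(rule_results: dict[str, bool],
               routing_rules: list[dict],
               default_action: str = "auto_approve") -> dict:
    """Return the routing decision for the first rule whose condition evaluated true."""
    for rule in routing_rules:
        if rule_results.get(rule["name"]):
            return {
                "reviewer_pool": rule["reviewer_pool"],
                "priority": rule["priority"],
                "sla_hours": rule["sla_hours"],
            }
    # No rule matched: apply the configured default
    return {"action": default_action}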

Review Task Structure

Each review task should be focused and actionable. Structure tasks to minimize cognitive load while capturing the information you need.
review_task_design.py
review_schema = {
    "task_type": "triage_validation",
    "version": "2.0",

    "sections": [
        {
            "name": "triage_assessment",
            "title": "Triage Decision Review",
            "questions": [
                {
                    "id": "triage_correct",
                    "type": "single_choice",
                    "question": "Was the AI's triage level appropriate?",
                    "options": [
                        {"value": "correct", "label": "Correct triage"},
                        {"value": "under", "label": "Under-triaged (should be more urgent)", "flag": "safety"},
                        {"value": "over", "label": "Over-triaged (could be less urgent)"},
                        {"value": "unclear", "label": "Insufficient information to determine"}
                    ],
                    "required": True
                },
                {
                    "id": "correct_level",
                    "type": "single_choice",
                    "question": "What should the triage level be?",
                    "condition": "triage_correct != 'correct'",
                    "options": [
                        {"value": "emergent", "label": "Emergent (immediate)"},
                        {"value": "urgent", "label": "Urgent (same day)"},
                        {"value": "semi_urgent", "label": "Semi-urgent (24-48 hrs)"},
                        {"value": "routine", "label": "Routine"},
                        {"value": "self_care", "label": "Self-care advice"}
                    ],
                    "required": True
                }
            ]
        },
        {
            "name": "safety_check",
            "title": "Patient Safety",
            "questions": [
                {
                    "id": "red_flags_addressed",
                    "type": "single_choice",
                    "question": "Were all red flags properly addressed?",
                    "options": [
                        {"value": "all", "label": "All red flags addressed"},
                        {"value": "partial", "label": "Some red flags missed"},
                        {"value": "critical_missed", "label": "Critical red flags missed", "flag": "critical"}
                    ],
                    "required": True
                },
                {
                    "id": "missed_flags",
                    "type": "multi_select",
                    "question": "Which red flags were missed?",
                    "condition": "red_flags_addressed != 'all'",
                    "options": "dynamic:red_flag_options",
                    "required": True
                }
            ]
        },
        {
            "name": "notes",
            "title": "Additional Notes",
            "questions": [
                {
                    "id": "justification",
                    "type": "text",
                    "question": "Please explain your assessment",
                    "condition": "triage_correct != 'correct' OR red_flags_addressed != 'all'",
                    "min_length": 50,
                    "required": True
                },
                {
                    "id": "training_value",
                    "type": "boolean",
                    "question": "Should this case be used for model training?",
                    "required": False
                }
            ]
        }
    ],

    # Time expectations
    "estimated_duration_seconds": 120,
    "max_duration_seconds": 600
}
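
To illustrate how the conditional questions and required flags in this schema might be enforced before a review is accepted, here is a small validator. The condition parser is a deliberate simplification (it handles only the field != 'value' clauses joined by OR that appear above) and is not part of the Rubric SDK.

def condition_met(condition: str, answers: dict) -> bool:
    """Evaluate the simple `field != 'value'` / OR conditions used in review_schema."""
    clauses = [c.strip() for c in condition.split(" OR ")]
    for clause in clauses:
        field, expected = clause.split(" != ")
        if answers.get(field) != expected.strip("'"):
            return True
    return False

def validate_review(schema: dict, answers: dict) -> list[str]:
    """Return a list of validation errors for a reviewer's submitted answers."""
    errors = []
    for section in schema["sections"]:
        for q in section["questions"]:
            # Skip questions whose display condition is not met
            if "condition" in q and not condition_met(q["condition"], answers):
                continue
            value = answers.get(q["id"])
            if q.get("required") and value in (None, "", []):
                errors.append(f"Missing required answer: {q['id']}")
            if q.get("min_length") and value and len(value) < q["min_length"]:
                errors.append(f"Answer too short: {q['id']}")
    return errors

# Example: a reviewer marks the triage as under-triaged but omits the follow-up answers
errors = validate_review(review_schema, {
    "triage_correct": "under",
    "red_flags_addressed": "all",
})
print(errors)  # ['Missing required answer: correct_level', 'Missing required answer: justification']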

Consensus & Disagreement Handling

For high-stakes decisions, multiple reviewers can assess the same case. Rubric provides mechanisms for measuring agreement and resolving disagreements.
consensus_config.py
human_review = {
    "consensus": {
        "enabled": True,

        # How many reviewers per case
        "reviewers_per_case": {
            "default": 1,
            "safety_critical": 2,
            "training_data": 3
        },

        # Agreement requirements
        "agreement_threshold": 0.8,  # 80% agreement on key questions

        # Disagreement resolution
        "disagreement_handling": {
            "method": "escalate",  # Options: escalate, majority_vote, senior_review
            "escalation_pool": "senior_physician",
            "max_rounds": 2
        },

        # Inter-rater reliability tracking
        "track_reliability": True,
        "reliability_metrics": ["cohens_kappa", "fleiss_kappa", "percent_agreement"]
    }
}
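
The reliability_metrics listed above are standard inter-rater agreement statistics. For reference, percent agreement and Cohen's kappa for a pair of reviewers reduce to a few lines; the data below is illustrative, not from a real evaluation.

from collections import Counter

def percent_agreement(ratings_a: list[str], ratings_b: list[str]) -> float:
    """Fraction of cases where two reviewers chose the same label."""
    matches = sum(a == b for a, b in zip(ratings_a, ratings_b))
    return matches / len(ratings_a)

def cohens_kappa(ratings_a: list[str], ratings_b: list[str]) -> float:
    """Agreement corrected for chance: (p_o - p_e) / (1 - p_e)."""
    n = len(ratings_a)
    p_o = percent_agreement(ratings_a, ratings_b)
    freq_a, freq_b = Counter(ratings_a), Counter(ratings_b)
    labels = set(ratings_a) | set(ratings_b)
    p_e = sum((freq_a[label] / n) * (freq_b[label] / n) for label in labels)
    return (p_o - p_e) / (1 - p_e) if p_e < 1 else 1.0

# Two reviewers rating the same five cases (illustrative data)
a = ["correct", "under", "correct", "over", "correct"]
b = ["correct", "under", "correct", "correct", "correct"]
print(percent_agreement(a, b))  # 0.8
print(cohens_kappa(a, b))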

Disagreement Patterns

| Pattern | Detection | Resolution Strategy |
| --- | --- | --- |
| Binary disagreement | Reviewers split on critical question | Escalate to senior reviewer |
| Severity disagreement | Agreement on direction, not magnitude | Use average or conservative estimate |
| Systematic bias | One reviewer consistently differs | Calibration session, potential removal |
| Ambiguous case | High disagreement across multiple reviewers | Flag for guideline clarification |

Learning from Disagreement: Disagreements are valuable data. They often indicate ambiguous cases that should inform guideline updates, model training, or reviewer calibration.
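
A per-case sketch of how the first, second, and fourth patterns in this table might be detected from reviewers' triage answers is shown below. Systematic bias is only visible across many cases, so it is omitted; the severity ordering and thresholds are assumptions for illustration.

# Ordered from least to most urgent, used to compare direction vs. magnitude
SEVERITY = ["self_care", "routine", "semi_urgent", "urgent", "emergent"]

def classify_disagreement(reviewer_levels: list[str]) -> str:
    """Map a set of reviewer triage choices onto the disagreement patterns above."""
    distinct = set(reviewer_levels)
    if len(distinct) == 1:
        return "agreement"
    ranks = sorted(SEVERITY.index(level) for level in distinct)
    spread = ranks[-1] - ranks[0]
    if len(distinct) == 2 and spread == 1:
        # Reviewers agree on direction but not magnitude -> conservative estimate
        return "severity_disagreement"
    if len(distinct) >= 3 or spread >= 3:
        # High disagreement across reviewers -> flag for guideline clarification
        return "ambiguous_case"
    # Remaining splits on the critical question -> escalate to a senior reviewer
    return "binary_disagreement"

print(classify_disagreement(["urgent", "semi_urgent"]))       # severity_disagreement
print(classify_disagreement(["emergent", "routine"]))         # ambiguous_case
print(classify_disagreement(["urgent", "urgent", "urgent"]))  # agreement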

Reviewer Experience

Well-designed review interfaces improve accuracy and reduce reviewer fatigue.

Interface Best Practices

| Principle | Implementation |
| --- | --- |
| Context first | Show patient presentation before AI output |
| Minimize scrolling | Key information visible without scrolling |
| Clear audio controls | Easy playback with speed control for voice calls |
| Keyboard shortcuts | 1/2/3 for common choices, space for play/pause |
| Progress visibility | Show queue position and completion stats |
| Fatigue prevention | Enforce breaks after extended sessions |

interface_config.py
interface_config = {
    "layout": "split_view",  # transcript left, grading right

    "audio_player": {
        "enabled": True,
        "speeds": [0.75, 1.0, 1.25, 1.5, 2.0],
        "default_speed": 1.25,
        "auto_scroll_transcript": True
    },

    "keyboard_shortcuts": {
        "1": "select_option_1",
        "2": "select_option_2",
        "3": "select_option_3",
        "space": "toggle_playback",
        "left": "rewind_5s",
        "right": "forward_5s",
        "enter": "submit_review",
        "s": "skip_case"
    },

    "fatigue_management": {
        "max_continuous_reviews": 25,
        "required_break_minutes": 5,
        "session_time_limit_hours": 4,
        "accuracy_monitoring": True,  # Flag if accuracy drops
        "accuracy_threshold": 0.85
    },

    "feedback": {
        "show_ai_reasoning": True,
        "show_confidence": True,
        "show_similar_cases": False,  # Avoid anchoring bias
        "post_review_feedback": True  # Learn from disagreements
    }
}
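
The fatigue_management settings above describe behavior rather than an implementation. One way such enforcement could work, sketched locally with assumed semantics for the accuracy check (rolling agreement with consensus over recent reviews):

from collections import deque

class FatigueMonitor:
    """Track consecutive reviews and rolling accuracy against the thresholds above."""

    def __init__(self, max_reviews: int = 25, accuracy_threshold: float = 0.85, window: int = 20):
        self.max_reviews = max_reviews
        self.accuracy_threshold = accuracy_threshold
        self.recent = deque(maxlen=window)  # rolling window of agreement outcomes
        self.since_break = 0

    def record(self, agreed_with_consensus: bool) -> list[str]:
        """Record one completed review and return any alerts it triggers."""
        self.since_break += 1
        self.recent.append(agreed_with_consensus)
        alerts = []
        if self.since_break >= self.max_reviews:
            alerts.append("break_required")
        accuracy = sum(self.recent) / len(self.recent)
        if len(self.recent) == self.recent.maxlen and accuracy < self.accuracy_threshold:
            alerts.append("accuracy_below_threshold")
        return alerts

    def take_break(self) -> None:
        self.since_break = 0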

Quality Assurance

Monitor reviewer performance and maintain calibration over time.
qa_monitoring.py
from rubric import Rubric

client = Rubric()

# Get reviewer performance metrics
qa_report = client.reviews.quality_report(
    evaluation_id="eval_abc123",
    metrics=[
        "agreement_rate",
        "review_time_distribution",
        "disagreement_patterns",
        "calibration_drift"
    ]
)

for reviewer in qa_report.reviewers:
    print(f"""
Reviewer: {reviewer.id} ({reviewer.credential})
  Reviews Completed: {reviewer.total_reviews}
  Agreement Rate: {reviewer.agreement_rate:.1%}
  Avg Review Time: {reviewer.avg_time_seconds:.0f}s
  Flags:
    {reviewer.quality_flags}
""")

# Identify calibration issues
if qa_report.calibration_drift > 0.1:
    print("⚠️ Calibration drift detected - schedule reviewer sync")

# Check for outliers
for outlier in qa_report.outlier_reviewers:
    print(f"⚠️ Outlier: {outlier.id} - {outlier.reason}")

Calibration Sessions

Regular calibration sessions ensure reviewers maintain consistent standards.
# Create calibration exercise
calibration = client.calibration.create(
    name="Monthly Triage Calibration",
    reviewer_pool="all_nurses",

    # Use known gold-standard cases
    cases=[
        {"id": "gold_1", "expected": {"triage": "emergent"}},
        {"id": "gold_2", "expected": {"triage": "urgent"}},
        {"id": "gold_3", "expected": {"triage": "routine"}},
        # Include edge cases
        {"id": "edge_1", "expected": {"triage": "urgent"}, "notes": "Subtle MI presentation"},
        {"id": "edge_2", "expected": {"triage": "semi_urgent"}, "notes": "Anxiety vs cardiac"}
    ],

    # Require minimum score to continue reviewing
    passing_score=0.85,

    # Provide feedback after submission
    show_explanations=True
)
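
Scoring such an exercise reduces to comparing each response against the gold-standard label and applying passing_score. The helper below is a local sketch, not a Rubric SDK method:

def score_calibration(responses: dict[str, str], gold_cases: list[dict]) -> float:
    """Fraction of gold-standard cases the reviewer triaged to the expected level."""
    correct = sum(
        responses.get(case["id"]) == case["expected"]["triage"]
        for case in gold_cases
    )
    return correct / len(gold_cases)

gold_cases = [
    {"id": "gold_1", "expected": {"triage": "emergent"}},
    {"id": "gold_2", "expected": {"triage": "urgent"}},
    {"id": "gold_3", "expected": {"triage": "routine"}},
]
score = score_calibration({"gold_1": "emergent", "gold_2": "urgent", "gold_3": "urgent"}, gold_cases)
print(f"Calibration score: {score:.0%}")  # 67% - below the 85% passing threshold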

Audit Trail

Every review decision is logged with full provenance for regulatory compliance.
# Get complete audit trail for a reviewed case
audit = client.reviews.audit_trail(sample_id="sample_xyz789")

print(f"""
Sample: {audit.sample_id}
AI Decision: {audit.ai_output}
Review Timeline:
""")

for event in audit.events:
    print(f"""
  {event.timestamp} - {event.type}
    Reviewer: {event.reviewer_id} ({event.credential})
    Decision: {event.decision}
    Confidence: {event.confidence}
    Time Spent: {event.duration_seconds}s
    Justification: {event.justification}
""")