# ML Testing Strategies
ML testing goes beyond unit tests. Interviewers expect you to know data validation, model validation, and integration testing specific to ML systems.
## The ML Testing Pyramid
```
                 /\
                /  \
               /    \
              / E2E  \                <-- Shadow testing, A/B tests
             /________\
            /          \
           / Integration\             <-- Pipeline tests, API tests
          /______________\
         /                \
        /   Model Tests    \          <-- Performance, fairness, robustness
       /____________________\
      /                      \
     /       Data Tests       \       <-- Schema, distribution, quality
    /__________________________\
   /                            \
  /          Unit Tests          \    <-- Feature engineering, preprocessing
 /________________________________\
```
## Interview Question: Testing Strategy
**Question:** "How would you set up testing for an ML pipeline that runs daily?"

**Comprehensive Answer:**
```python
# Test categories for ML systems
ml_test_categories = {
    "unit_tests": {
        "purpose": "Test individual functions",
        "examples": [
            "Feature engineering transformations",
            "Preprocessing logic",
            "Custom loss functions",
        ],
        "run_frequency": "Every commit",
    },
    "data_tests": {
        "purpose": "Validate data quality",
        "examples": [
            "Schema validation",
            "Distribution tests (vs baseline)",
            "Null rate thresholds",
            "Referential integrity",
        ],
        "run_frequency": "Every pipeline run",
    },
    "model_tests": {
        "purpose": "Validate model quality",
        "examples": [
            "Minimum accuracy threshold",
            "Inference latency requirements",
            "Fairness metrics (by demographic)",
            "Robustness to perturbations",
        ],
        "run_frequency": "Before deployment",
    },
    "integration_tests": {
        "purpose": "Test component interactions",
        "examples": [
            "End-to-end pipeline execution",
            "API contract tests",
            "Feature store integration",
        ],
        "run_frequency": "Daily / pre-merge",
    },
}
```
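One way to wire these run frequencies into CI is with pytest markers, so each stage selects only the categories it needs. A minimal sketch; the marker names and the example test are illustrative, not from any specific project:

```python
# pytest.ini registers one marker per category (illustrative):
#   [pytest]
#   markers =
#       unit: fast tests for individual functions
#       data: batch-level data quality checks
#       model: pre-deployment model quality gates
#       integration: cross-component tests
#
# Each CI stage then selects what it needs:
#   every commit:        pytest -m unit
#   every pipeline run:  pytest -m data
#   before deployment:   pytest -m model
#   daily / pre-merge:   pytest -m integration

import math

import pytest


@pytest.mark.unit
def test_log_transform_handles_zero():
    """Feature-engineering edge case: log1p(0) must be exactly 0."""
    assert math.log1p(0) == 0.0
```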
## Data Validation with Great Expectations
```python
import great_expectations as gx


def create_data_expectations(context, batch_request):
    """Define data quality expectations for the training data."""
    # Expectations are added through a validator bound to a batch;
    # they are recorded in the named suite as we go.
    validator = context.get_validator(
        batch_request=batch_request,
        expectation_suite_name="training_data_suite",
    )

    # Schema expectations
    validator.expect_column_to_exist("user_id")
    validator.expect_column_to_exist("transaction_amount")
    validator.expect_column_to_exist("timestamp")

    # Type expectations
    validator.expect_column_values_to_be_of_type(
        "transaction_amount", "float64"
    )

    # Value range expectations
    validator.expect_column_values_to_be_between(
        "transaction_amount", min_value=0, max_value=1_000_000
    )

    # Null rate expectations
    validator.expect_column_values_to_not_be_null("user_id", mostly=1.0)
    validator.expect_column_values_to_not_be_null(
        "transaction_amount", mostly=0.99  # Allow up to 1% nulls
    )

    # Distribution expectations
    validator.expect_column_mean_to_be_between(
        "transaction_amount", min_value=50, max_value=500
    )

    # Uniqueness expectations
    validator.expect_column_values_to_be_unique("transaction_id")

    # Persist the suite so daily pipeline runs can reuse it
    validator.save_expectation_suite(discard_failed_expectations=False)
    return validator.expectation_suite
```
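In the daily run, the saved suite then gates each new batch. A minimal sketch using the Validator API; the function name and failure handling here are illustrative:

```python
def validate_daily_batch(context, batch_request):
    """Fail the pipeline run if today's batch violates the suite."""
    validator = context.get_validator(
        batch_request=batch_request,
        expectation_suite_name="training_data_suite",
    )
    results = validator.validate()
    if not results.success:
        # Surface which expectations failed, then halt the pipeline
        failed = [
            r.expectation_config.expectation_type
            for r in results.results
            if not r.success
        ]
        raise ValueError(f"Data validation failed: {failed}")
```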
## Model Performance Tests
```python
import time

import numpy as np
import pytest
from sklearn.metrics import accuracy_score, precision_score


class TestModelPerformance:
    """Model must pass these tests before deployment"""

    @pytest.fixture
    def model(self):
        return load_production_model("fraud_detector")

    @pytest.fixture
    def test_data(self):
        return load_test_dataset("fraud_test_2024")

    @pytest.fixture
    def sample_input(self, test_data):
        # Single representative record for latency measurement
        return test_data.features[:1]

    def test_accuracy_threshold(self, model, test_data):
        """Model accuracy must exceed 90%"""
        predictions = model.predict(test_data.features)
        accuracy = accuracy_score(test_data.labels, predictions)
        assert accuracy >= 0.90, f"Accuracy {accuracy:.3f} below threshold 0.90"

    def test_precision_threshold(self, model, test_data):
        """Precision must exceed 85% to minimize false positives"""
        predictions = model.predict(test_data.features)
        precision = precision_score(test_data.labels, predictions)
        assert precision >= 0.85, f"Precision {precision:.3f} below threshold 0.85"

    def test_inference_latency(self, model, sample_input):
        """P99 latency must be under 100ms"""
        latencies = []
        for _ in range(1000):
            start = time.perf_counter()
            model.predict(sample_input)
            latencies.append((time.perf_counter() - start) * 1000)
        p99_latency = np.percentile(latencies, 99)
        assert p99_latency < 100, f"P99 latency {p99_latency:.1f}ms exceeds 100ms"

    def test_fairness_by_demographic(self, model, test_data):
        """Ensure no demographic has >10% accuracy disparity"""
        results_by_group = {}
        for group in test_data.demographic_groups:
            group_data = test_data.filter(demographic=group)
            predictions = model.predict(group_data.features)
            results_by_group[group] = accuracy_score(group_data.labels, predictions)
        max_disparity = max(results_by_group.values()) - min(results_by_group.values())
        assert max_disparity <= 0.10, f"Fairness disparity {max_disparity:.3f} exceeds 10%"
```
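The model-test category above also lists robustness to perturbations, which this class does not yet cover. A sketch of one such test to add to `TestModelPerformance`, assuming the same fixtures, a numeric feature matrix, and an illustrative 2% flip-rate budget:

```python
    def test_robustness_to_perturbation(self, model, test_data):
        """Predictions should be stable under small feature noise."""
        rng = np.random.default_rng(42)
        noise = rng.normal(0, 0.01, size=test_data.features.shape)
        baseline = model.predict(test_data.features)
        perturbed = model.predict(test_data.features + noise)
        flip_rate = np.mean(baseline != perturbed)
        assert flip_rate <= 0.02, f"{flip_rate:.1%} of predictions flipped under noise"
```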
## Testing Interview Cheat Sheet
| Test Type | What It Catches | Tools |
|---|---|---|
| Unit | Logic bugs in code | pytest, unittest |
| Data | Schema changes, quality issues | Great Expectations, Pandera |
| Model | Performance regression | pytest, custom metrics |
| Integration | Component mismatches | pytest, API testing |
| Contract | API breaking changes | Pact, schemathesis |
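For the contract row, Pact and schemathesis are the full-featured options, but even a small response-shape check catches breaking API changes early. A sketch assuming a hypothetical local `POST /predict` endpoint that returns a score and a model version:

```python
import requests
from pydantic import BaseModel


class PredictResponse(BaseModel):
    """Expected response shape; any drift fails validation."""
    score: float
    model_version: str


def test_predict_contract():
    resp = requests.post(
        "http://localhost:8000/predict",  # hypothetical local endpoint
        json={"user_id": "u123", "transaction_amount": 42.0},
        timeout=5,
    )
    assert resp.status_code == 200
    PredictResponse.model_validate(resp.json())  # raises on schema drift
```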
**Interview Signal:** Mentioning fairness testing and demographic analysis shows maturity beyond basic ML testing.
Next, we'll cover GitHub Actions workflows for ML.