Data Validation & Testing
ML Testing with pytest
5 min read
English Content
The ML Testing Pyramid
ML testing extends traditional software testing with model-specific tests:
/\
/ \
/ E2E \ ← Model integration tests
/________\
/ \
/ Behavioral \ ← Invariance, directional
/______________\
/ \
/ Model Unit \ ← Single prediction tests
/____________________\
/ \
/ Data Validation \ ← Schema, statistics
/__________________________\
/ \
/ Code Unit Tests \ ← Functions, preprocessing
/________________________________\
Setting Up pytest for ML
# tests/conftest.py
import pytest
import pandas as pd
import numpy as np
import joblib
from pathlib import Path


@pytest.fixture(scope="session")
def trained_model():
    """Load the production model once per test session, or skip if absent."""
    model_path = Path("models/production/model.joblib")
    if model_path.exists():
        return joblib.load(model_path)
    pytest.skip("No trained model available")


@pytest.fixture
def sample_features():
    """Build a deterministic (seeded) 100-row frame of synthetic features."""
    np.random.seed(42)
    # Draw order matters for reproducibility with the fixed seed.
    columns = {
        "feature_1": np.random.normal(0, 1, 100),
        "feature_2": np.random.normal(0, 1, 100),
        "feature_3": np.random.uniform(0, 1, 100),
    }
    return pd.DataFrame(columns)


@pytest.fixture
def edge_case_features():
    """Hand-picked boundary rows: zeros, extreme magnitudes, and a NaN."""
    rows = {
        "feature_1": [0.0, -999.0, 999.0, np.nan],
        "feature_2": [0.0, 0.0, 0.0, 0.0],
        "feature_3": [0.0, 1.0, 0.5, 0.5],
    }
    return pd.DataFrame(rows)
Unit Tests for ML Code
Test preprocessing and feature engineering:
# tests/test_preprocessing.py
import pytest
import pandas as pd
import numpy as np
from src.preprocessing import normalize_features, encode_categories, handle_missing
class TestNormalizeFeatures:
    """Unit tests for feature normalization."""

    def test_output_range(self):
        """Min-max scaling must land every value inside [0, 1]."""
        frame = pd.DataFrame({"value": [1, 2, 3, 4, 5]})
        scaled = normalize_features(frame, method="minmax")
        assert scaled["value"].min() >= 0
        assert scaled["value"].max() <= 1

    def test_handles_constant_column(self):
        """A zero-variance column must not produce NaN/inf."""
        frame = pd.DataFrame({"value": [5, 5, 5, 5, 5]})
        scaled = normalize_features(frame, method="zscore")
        # z-scoring a constant column should yield all zeros rather
        # than dividing by a zero standard deviation.
        assert (scaled["value"] == 0).all()

    def test_preserves_column_order(self):
        """Normalization must not reorder the frame's columns."""
        frame = pd.DataFrame({"a": [1, 2], "b": [3, 4], "c": [5, 6]})
        scaled = normalize_features(frame)
        assert list(scaled.columns) == ["a", "b", "c"]
class TestHandleMissing:
    """Unit tests for missing-value handling."""

    def test_fills_with_median(self):
        """NaNs must be replaced by the column median."""
        frame = pd.DataFrame({"value": [1, 2, np.nan, 4, 5]})
        filled = handle_missing(frame, strategy="median")
        assert not filled["value"].isna().any()
        assert filled["value"].iloc[2] == 3  # median of [1, 2, 4, 5]

    def test_drops_rows_above_threshold(self):
        """Rows whose missing fraction exceeds the threshold are removed."""
        frame = pd.DataFrame({
            "a": [1, np.nan, np.nan],
            "b": [2, np.nan, 3],
            "c": [3, np.nan, 4],
        })
        cleaned = handle_missing(frame, drop_threshold=0.5)
        # Only the all-NaN middle row crosses the 0.5 threshold.
        assert len(cleaned) == 2
Model Unit Tests
Test individual predictions:
# tests/test_model.py
import pytest
import numpy as np
class TestModelPredictions:
    """Tests for model prediction behavior (shape, type, range, determinism)."""

    def test_output_shape(self, trained_model, sample_features):
        """Predictions should have one entry per input row."""
        predictions = trained_model.predict(sample_features)
        assert len(predictions) == len(sample_features)

    def test_output_type(self, trained_model, sample_features):
        """Predictions should be numeric."""
        predictions = trained_model.predict(sample_features)
        assert np.issubdtype(predictions.dtype, np.number)

    def test_probability_range(self, trained_model, sample_features):
        """Probabilities should be in [0, 1] and sum to 1 per row."""
        # Skip explicitly instead of silently passing when the model
        # (e.g. a regressor) exposes no predict_proba — a silent pass
        # hides the fact that nothing was checked.
        if not hasattr(trained_model, "predict_proba"):
            pytest.skip("model does not implement predict_proba")
        probas = trained_model.predict_proba(sample_features)
        assert probas.min() >= 0
        assert probas.max() <= 1
        assert np.allclose(probas.sum(axis=1), 1.0)

    def test_deterministic(self, trained_model, sample_features):
        """Same input should produce same output."""
        pred1 = trained_model.predict(sample_features)
        pred2 = trained_model.predict(sample_features)
        np.testing.assert_array_equal(pred1, pred2)
Behavioral Tests
Test model behavior under transformations:
# tests/test_behavioral.py
import pytest
import numpy as np
import pandas as pd
class TestInvariance:
    """Tests for input invariance properties."""

    def test_feature_order_invariance(self, trained_model):
        """Reordering input columns must not change predictions."""
        original = pd.DataFrame({
            "feature_1": [0.5],
            "feature_2": [0.3],
            "feature_3": [0.2],
        })
        shuffled = original[["feature_3", "feature_1", "feature_2"]]
        # Columns are re-aligned before predicting; the model may
        # depend on positional feature order.
        pred_a = trained_model.predict(original)
        pred_b = trained_model.predict(shuffled[original.columns])
        np.testing.assert_array_equal(pred_a, pred_b)

    def test_scale_invariance(self, trained_model):
        """A tiny input perturbation should move the prediction only slightly."""
        anchor = pd.DataFrame({
            "feature_1": [0.5],
            "feature_2": [0.3],
            "feature_3": [0.2],
        })
        nudged = anchor + 0.001
        pred_anchor = trained_model.predict(anchor)
        pred_nudged = trained_model.predict(nudged)
        # Local stability: nearby inputs should give nearby outputs.
        assert abs(pred_anchor[0] - pred_nudged[0]) < 0.1
class TestDirectional:
    """Tests for directional expectations."""

    def test_higher_risk_features(self, trained_model):
        """Raising debt_ratio alone should raise the predicted risk."""
        # Two borrowers identical except for debt_ratio.
        low_risk = pd.DataFrame({
            "age": [25],
            "debt_ratio": [0.1],
            "income": [100000],
        })
        high_risk = low_risk.assign(debt_ratio=[0.9])
        pred_low = trained_model.predict_proba(low_risk)[0, 1]
        pred_high = trained_model.predict_proba(high_risk)[0, 1]
        assert pred_high > pred_low, "Higher debt should increase risk"
Pre-Training Validation
# tests/test_pretrain.py
import pytest
import pandas as pd
import numpy as np  # required by test_feature_variance (was missing)


class TestPreTraining:
    """Validation before training starts: data volume, balance, leakage, variance."""

    def test_sufficient_training_data(self, training_data):
        """Should have a minimum amount of training data."""
        min_samples = 1000
        assert len(training_data) >= min_samples

    def test_class_balance(self, training_data):
        """Classes should not be severely imbalanced."""
        class_counts = training_data["label"].value_counts()
        ratio = class_counts.min() / class_counts.max()
        assert ratio >= 0.1, f"Class imbalance too severe: {ratio:.2f}"

    def test_no_data_leakage(self, training_data, test_data):
        """Training and test sets should not share any sample IDs."""
        train_ids = set(training_data["id"])
        test_ids = set(test_data["id"])
        overlap = train_ids & test_ids
        assert len(overlap) == 0, f"Found {len(overlap)} overlapping IDs"

    def test_feature_variance(self, training_data):
        """Numeric features should have non-zero variance."""
        numeric_cols = training_data.select_dtypes(include=[np.number]).columns
        low_variance = [
            col for col in numeric_cols
            if training_data[col].std() < 1e-10
        ]
        assert len(low_variance) == 0, f"Zero variance columns: {low_variance}"
CI/CD Integration
# .github/workflows/ml-tests.yml
name: ML Tests

on: [push, pull_request]

jobs:
  test:
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v4
      - uses: actions/setup-python@v5
        with:
          python-version: '3.11'
      - name: Install dependencies
        run: pip install pytest pytest-cov pandas numpy scikit-learn
      - name: Run unit tests
        run: pytest tests/test_preprocessing.py -v
      - name: Run data validation tests
        run: pytest tests/test_data_validation.py -v
      - name: Run model tests
        run: pytest tests/test_model.py tests/test_behavioral.py -v
      - name: Generate coverage report
        run: pytest --cov=src --cov-report=xml
      - name: Upload coverage
        uses: codecov/codecov-action@v4
Key Takeaways
| Test Type | What to Test | When to Run |
|---|---|---|
| Code unit | Preprocessing, feature engineering | Every commit |
| Data validation | Schema, statistics, business rules | Data changes |
| Model unit | Predictions, shapes, types | After training |
| Behavioral | Invariance, directional expectations | After training |
| Pre-training | Data quality, leakage, balance | Before training |
المحتوى العربي
هرم اختبار ML
يوسّع اختبار ML الاختبار البرمجي التقليدي باختبارات خاصة بالنموذج:
/\
/ \
/ E2E \ ← اختبارات تكامل النموذج
/________\
/ \
/ Behavioral \ ← الثبات، الاتجاهي
/______________\
/ \
/ Model Unit \ ← اختبارات تنبؤ فردي
/____________________\
/ \
/ Data Validation \ ← المخطط، الإحصائيات
/__________________________\
/ \
/ Code Unit Tests \ ← الدوال، المعالجة المسبقة
/________________________________\
إعداد pytest لـ ML
# tests/conftest.py
import pytest
import pandas as pd
import numpy as np
import joblib
from pathlib import Path


@pytest.fixture(scope="session")
def trained_model():
    """Load the trained model for testing, or skip if none is available."""
    model_path = Path("models/production/model.joblib")
    if model_path.exists():
        return joblib.load(model_path)
    pytest.skip("No trained model available")


@pytest.fixture
def sample_features():
    """Generate a deterministic (seeded) 100-row sample feature frame."""
    np.random.seed(42)
    # Draw order matters for reproducibility with the fixed seed.
    columns = {
        "feature_1": np.random.normal(0, 1, 100),
        "feature_2": np.random.normal(0, 1, 100),
        "feature_3": np.random.uniform(0, 1, 100),
    }
    return pd.DataFrame(columns)


@pytest.fixture
def edge_case_features():
    """Generate edge-case rows: zeros, extreme magnitudes, and a NaN."""
    rows = {
        "feature_1": [0.0, -999.0, 999.0, np.nan],
        "feature_2": [0.0, 0.0, 0.0, 0.0],
        "feature_3": [0.0, 1.0, 0.5, 0.5],
    }
    return pd.DataFrame(rows)
اختبارات الوحدة لكود ML
اختبار المعالجة المسبقة وهندسة الميزات:
# tests/test_preprocessing.py
import pytest
import pandas as pd
import numpy as np
from src.preprocessing import normalize_features, encode_categories, handle_missing


class TestNormalizeFeatures:
    """Unit tests for feature normalization."""

    def test_output_range(self):
        """Min-max scaling must land every value inside [0, 1]."""
        frame = pd.DataFrame({"value": [1, 2, 3, 4, 5]})
        scaled = normalize_features(frame, method="minmax")
        assert scaled["value"].min() >= 0
        assert scaled["value"].max() <= 1

    def test_handles_constant_column(self):
        """A zero-variance column must not produce NaN/inf."""
        frame = pd.DataFrame({"value": [5, 5, 5, 5, 5]})
        scaled = normalize_features(frame, method="zscore")
        # z-scoring a constant column should yield all zeros rather
        # than dividing by a zero standard deviation.
        assert (scaled["value"] == 0).all()

    def test_preserves_column_order(self):
        """Normalization must not reorder the frame's columns."""
        frame = pd.DataFrame({"a": [1, 2], "b": [3, 4], "c": [5, 6]})
        scaled = normalize_features(frame)
        assert list(scaled.columns) == ["a", "b", "c"]


class TestHandleMissing:
    """Unit tests for missing-value handling."""

    def test_fills_with_median(self):
        """NaNs must be replaced by the column median."""
        frame = pd.DataFrame({"value": [1, 2, np.nan, 4, 5]})
        filled = handle_missing(frame, strategy="median")
        assert not filled["value"].isna().any()
        assert filled["value"].iloc[2] == 3  # median of [1, 2, 4, 5]

    def test_drops_rows_above_threshold(self):
        """Rows whose missing fraction exceeds the threshold are removed."""
        frame = pd.DataFrame({
            "a": [1, np.nan, np.nan],
            "b": [2, np.nan, 3],
            "c": [3, np.nan, 4],
        })
        cleaned = handle_missing(frame, drop_threshold=0.5)
        # Only the all-NaN middle row crosses the 0.5 threshold.
        assert len(cleaned) == 2
اختبارات وحدة النموذج
اختبار التنبؤات الفردية:
# tests/test_model.py
import pytest
import numpy as np
class TestModelPredictions:
    """Tests for model prediction behavior (shape, type, range, determinism)."""

    def test_output_shape(self, trained_model, sample_features):
        """Predictions should have one entry per input row."""
        predictions = trained_model.predict(sample_features)
        assert len(predictions) == len(sample_features)

    def test_output_type(self, trained_model, sample_features):
        """Predictions should be numeric."""
        predictions = trained_model.predict(sample_features)
        assert np.issubdtype(predictions.dtype, np.number)

    def test_probability_range(self, trained_model, sample_features):
        """Probabilities should be in [0, 1] and sum to 1 per row."""
        # Skip explicitly instead of silently passing when the model
        # (e.g. a regressor) exposes no predict_proba — a silent pass
        # hides the fact that nothing was checked.
        if not hasattr(trained_model, "predict_proba"):
            pytest.skip("model does not implement predict_proba")
        probas = trained_model.predict_proba(sample_features)
        assert probas.min() >= 0
        assert probas.max() <= 1
        assert np.allclose(probas.sum(axis=1), 1.0)

    def test_deterministic(self, trained_model, sample_features):
        """Same input should produce same output."""
        pred1 = trained_model.predict(sample_features)
        pred2 = trained_model.predict(sample_features)
        np.testing.assert_array_equal(pred1, pred2)
الاختبارات السلوكية
اختبار سلوك النموذج تحت التحولات:
# tests/test_behavioral.py
import pytest
import numpy as np
import pandas as pd
class TestInvariance:
    """Tests for input invariance properties."""

    def test_feature_order_invariance(self, trained_model):
        """Reordering input columns must not change predictions."""
        original = pd.DataFrame({
            "feature_1": [0.5],
            "feature_2": [0.3],
            "feature_3": [0.2],
        })
        shuffled = original[["feature_3", "feature_1", "feature_2"]]
        # Columns are re-aligned before predicting; the model may
        # depend on positional feature order.
        pred_a = trained_model.predict(original)
        pred_b = trained_model.predict(shuffled[original.columns])
        np.testing.assert_array_equal(pred_a, pred_b)

    def test_scale_invariance(self, trained_model):
        """A tiny input perturbation should move the prediction only slightly."""
        anchor = pd.DataFrame({
            "feature_1": [0.5],
            "feature_2": [0.3],
            "feature_3": [0.2],
        })
        nudged = anchor + 0.001
        pred_anchor = trained_model.predict(anchor)
        pred_nudged = trained_model.predict(nudged)
        # Local stability: nearby inputs should give nearby outputs.
        assert abs(pred_anchor[0] - pred_nudged[0]) < 0.1
class TestDirectional:
    """Tests for directional expectations."""

    def test_higher_risk_features(self, trained_model):
        """Raising debt_ratio alone should raise the predicted risk."""
        # Two borrowers identical except for debt_ratio.
        low_risk = pd.DataFrame({
            "age": [25],
            "debt_ratio": [0.1],
            "income": [100000],
        })
        high_risk = low_risk.assign(debt_ratio=[0.9])
        pred_low = trained_model.predict_proba(low_risk)[0, 1]
        pred_high = trained_model.predict_proba(high_risk)[0, 1]
        assert pred_high > pred_low, "الدين الأعلى يجب أن يزيد المخاطر"
التحقق قبل التدريب
# tests/test_pretrain.py
import pytest
import pandas as pd
import numpy as np  # required by test_feature_variance (was missing)


class TestPreTraining:
    """Validation before training starts: data volume, balance, leakage, variance."""

    def test_sufficient_training_data(self, training_data):
        """Should have a minimum amount of training data."""
        min_samples = 1000
        assert len(training_data) >= min_samples

    def test_class_balance(self, training_data):
        """Classes should not be severely imbalanced."""
        class_counts = training_data["label"].value_counts()
        ratio = class_counts.min() / class_counts.max()
        assert ratio >= 0.1, f"Class imbalance too severe: {ratio:.2f}"

    def test_no_data_leakage(self, training_data, test_data):
        """Training and test sets should not share any sample IDs."""
        train_ids = set(training_data["id"])
        test_ids = set(test_data["id"])
        overlap = train_ids & test_ids
        assert len(overlap) == 0, f"Found {len(overlap)} overlapping IDs"

    def test_feature_variance(self, training_data):
        """Numeric features should have non-zero variance."""
        numeric_cols = training_data.select_dtypes(include=[np.number]).columns
        low_variance = [
            col for col in numeric_cols
            if training_data[col].std() < 1e-10
        ]
        assert len(low_variance) == 0, f"Zero variance columns: {low_variance}"
تكامل CI/CD
# .github/workflows/ml-tests.yml
name: ML Tests

on: [push, pull_request]

jobs:
  test:
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v4
      - uses: actions/setup-python@v5
        with:
          python-version: '3.11'
      - name: Install dependencies
        run: pip install pytest pytest-cov pandas numpy scikit-learn
      - name: Run unit tests
        run: pytest tests/test_preprocessing.py -v
      - name: Run data validation tests
        run: pytest tests/test_data_validation.py -v
      - name: Run model tests
        run: pytest tests/test_model.py tests/test_behavioral.py -v
      - name: Generate coverage report
        run: pytest --cov=src --cov-report=xml
      - name: Upload coverage
        uses: codecov/codecov-action@v4
النقاط الرئيسية
| نوع الاختبار | ما يُختبر | متى يُشغّل |
|---|---|---|
| وحدة الكود | المعالجة المسبقة، هندسة الميزات | كل commit |
| التحقق من البيانات | المخطط، الإحصائيات، قواعد العمل | تغييرات البيانات |
| وحدة النموذج | التنبؤات، الأشكال، الأنواع | بعد التدريب |
| السلوكي | الثبات، التوقعات الاتجاهية | بعد التدريب |
| قبل التدريب | جودة البيانات، التسرب، التوازن | قبل التدريب |