Production Operations and GitOps
CI/CD for ML Model Deployment
3 min read
ML CI/CD pipelines extend beyond traditional software delivery, adding model validation, performance testing, and automated canary deployment. This lesson covers GitHub Actions workflows and Tekton pipelines for ML.
ML CI/CD Pipeline Architecture
┌──────────────────────────────────────────────────────────────────────┐
│                           ML CI/CD Pipeline                          │
├──────────────────────────────────────────────────────────────────────┤
│                                                                      │
│  ┌──────────┐    ┌──────────┐    ┌──────────┐    ┌──────────┐        │
│  │   Code   │──→ │  Build   │──→ │   Test   │──→ │ Security │        │
│  │   Push   │    │  Image   │    │  Model   │    │   Scan   │        │
│  └──────────┘    └──────────┘    └──────────┘    └──────────┘        │
│                                                        │             │
│                                                        ↓             │
│  ┌──────────┐    ┌──────────┐    ┌──────────┐    ┌──────────┐        │
│  │  Deploy  │←── │  Deploy  │←── │ Staging  │←── │ Push to  │        │
│  │   Prod   │    │  Canary  │    │   Test   │    │ Registry │        │
│  └──────────┘    └──────────┘    └──────────┘    └──────────┘        │
│       │               │                                              │
│       └───────────────┤   Monitoring & Rollback                      │
│                       ↓                                              │
│  ┌────────────────────────────────────────────────────────────┐      │
│  │           Monitoring (Metrics, Logs, Traces)               │      │
│  └────────────────────────────────────────────────────────────┘      │
└──────────────────────────────────────────────────────────────────────┘
GitHub Actions for ML
# .github/workflows/ml-deploy.yaml
name: Deploy ML Model

on:
  push:
    branches: [main]
    paths:
      - 'models/**'
      - 'inference/**'
  pull_request:
    branches: [main]

env:
  REGISTRY: gcr.io
  PROJECT_ID: ml-production
  CLUSTER_NAME: ml-cluster
  CLUSTER_ZONE: us-central1-a

jobs:
  test-model:
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v4

      - name: Set up Python
        uses: actions/setup-python@v5
        with:
          python-version: '3.11'

      - name: Install dependencies
        run: |
          pip install -r requirements.txt
          pip install pytest pytest-benchmark

      - name: Run model unit tests
        run: pytest tests/unit/ -v

      - name: Run model performance tests
        run: |
          pytest tests/performance/ --benchmark-json=benchmark.json

      - name: Check for performance regression
        run: |
          # Fail the build if the benchmark regresses more than 10%
          python scripts/check_performance.py benchmark.json \
            --baseline benchmarks/baseline.json \
            --threshold 0.1
  build-and-push:
    needs: test-model
    runs-on: ubuntu-latest
    outputs:
      image-tag: ${{ steps.meta.outputs.tags }}
    steps:
      - uses: actions/checkout@v4

      - name: Set up Docker Buildx
        uses: docker/setup-buildx-action@v3

      - name: Log in to GCR
        uses: docker/login-action@v3
        with:
          registry: gcr.io
          username: _json_key
          password: ${{ secrets.GCP_SA_KEY }}

      - name: Extract metadata
        id: meta
        uses: docker/metadata-action@v5
        with:
          images: ${{ env.REGISTRY }}/${{ env.PROJECT_ID }}/inference
          tags: |
            type=sha,prefix=
            type=ref,event=branch

      - name: Build and push
        uses: docker/build-push-action@v5
        with:
          context: .
          push: true
          tags: ${{ steps.meta.outputs.tags }}
          cache-from: type=gha
          cache-to: type=gha,mode=max
  security-scan:
    needs: build-and-push
    runs-on: ubuntu-latest
    steps:
      - name: Scan image for vulnerabilities
        uses: aquasecurity/trivy-action@master
        with:
          image-ref: ${{ needs.build-and-push.outputs.image-tag }}
          format: 'sarif'
          output: 'trivy-results.sarif'
          severity: 'HIGH,CRITICAL'

      - name: Upload scan results
        uses: github/codeql-action/upload-sarif@v3
        with:
          sarif_file: 'trivy-results.sarif'
  deploy-staging:
    needs: [build-and-push, security-scan]
    runs-on: ubuntu-latest
    environment: staging
    steps:
      - uses: actions/checkout@v4

      - name: Authenticate to Google Cloud
        uses: google-github-actions/auth@v2
        with:
          credentials_json: ${{ secrets.GCP_SA_KEY }}

      - name: Set up gcloud
        uses: google-github-actions/setup-gcloud@v2

      - name: Get GKE credentials
        run: |
          gcloud container clusters get-credentials ${{ env.CLUSTER_NAME }} \
            --zone ${{ env.CLUSTER_ZONE }}

      - name: Deploy to staging
        run: |
          kubectl set image deployment/inference-staging \
            inference=${{ needs.build-and-push.outputs.image-tag }} \
            -n ml-staging

      - name: Wait for rollout
        run: |
          kubectl rollout status deployment/inference-staging \
            -n ml-staging --timeout=300s

      - name: Run integration tests
        run: |
          python scripts/integration_tests.py \
            --endpoint https://staging.inference.example.com \
            --test-data tests/fixtures/integration.json
  deploy-canary:
    # build-and-push must be in needs so its image-tag output is accessible
    needs: [build-and-push, deploy-staging]
    runs-on: ubuntu-latest
    environment: production
    steps:
      - uses: actions/checkout@v4

      - name: Authenticate to Google Cloud
        uses: google-github-actions/auth@v2
        with:
          credentials_json: ${{ secrets.GCP_SA_KEY }}

      - name: Set up gcloud
        uses: google-github-actions/setup-gcloud@v2

      - name: Get GKE credentials
        run: |
          gcloud container clusters get-credentials ${{ env.CLUSTER_NAME }} \
            --zone ${{ env.CLUSTER_ZONE }}

      - name: Deploy canary (10% traffic)
        run: |
          # Update the canary deployment
          kubectl set image deployment/inference-canary \
            inference=${{ needs.build-and-push.outputs.image-tag }} \
            -n ml-serving
          # Update the Istio VirtualService to send 10% of traffic to the canary
          kubectl patch virtualservice inference-vs -n ml-serving \
            --type=json \
            -p='[{"op": "replace", "path": "/spec/http/0/route/1/weight", "value": 10}]'

      - name: Monitor canary metrics
        run: |
          # Record the outcome so the next step can promote or roll back
          if python scripts/canary_monitor.py \
               --duration 600 \
               --error-threshold 0.01 \
               --latency-threshold-p99 2.0; then
            echo "CANARY_SUCCESS=true" >> "$GITHUB_ENV"
          else
            echo "CANARY_SUCCESS=false" >> "$GITHUB_ENV"
          fi

      - name: Promote or roll back
        run: |
          if [ "$CANARY_SUCCESS" == "true" ]; then
            # Promote the canary to 100% of traffic
            kubectl patch virtualservice inference-vs -n ml-serving \
              --type=json \
              -p='[{"op": "replace", "path": "/spec/http/0/route/1/weight", "value": 100}]'
          else
            # Roll back
            kubectl rollout undo deployment/inference-canary -n ml-serving
          fi
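The workflow above delegates canary analysis to scripts/canary_monitor.py, which this lesson does not show. A minimal sketch of what such a script could look like, assuming an in-cluster Prometheus at http://prometheus:9090 and the inference_errors_total / inference_requests_total counters used later in this lesson; the latency histogram name and the 30-second poll interval are likewise assumptions:

#!/usr/bin/env python3
# Hypothetical sketch of scripts/canary_monitor.py (not part of the original
# lesson): polls Prometheus for the canary's error rate and p99 latency over
# a fixed window, exiting non-zero if either threshold is breached.
import argparse
import json
import sys
import time
import urllib.parse
import urllib.request

PROMETHEUS_URL = "http://prometheus:9090"  # assumption: in-cluster Prometheus


def query(promql: str) -> float:
    """Run an instant PromQL query and return the first scalar result."""
    url = PROMETHEUS_URL + "/api/v1/query?" + urllib.parse.urlencode({"query": promql})
    with urllib.request.urlopen(url, timeout=10) as resp:
        result = json.load(resp)["data"]["result"]
    return float(result[0]["value"][1]) if result else 0.0


def main() -> int:
    parser = argparse.ArgumentParser()
    parser.add_argument("--duration", type=int, default=600)                 # seconds
    parser.add_argument("--error-threshold", type=float, default=0.01)
    parser.add_argument("--latency-threshold-p99", type=float, default=2.0)  # seconds
    args = parser.parse_args()

    deadline = time.time() + args.duration
    while time.time() < deadline:
        # Metric names/labels mirror the Prometheus queries used elsewhere
        # in this lesson; adjust them to match your instrumentation.
        error_rate = query(
            'sum(rate(inference_errors_total{deployment="canary"}[5m])) / '
            'sum(rate(inference_requests_total{deployment="canary"}[5m]))'
        )
        p99 = query(
            "histogram_quantile(0.99, sum(rate("
            'inference_latency_seconds_bucket{deployment="canary"}[5m])) by (le))'
        )
        if error_rate > args.error_threshold or p99 > args.latency_threshold_p99:
            print(f"Canary unhealthy: error_rate={error_rate:.4f}, p99={p99:.2f}s")
            return 1
        time.sleep(30)  # poll interval between Prometheus checks
    print("Canary healthy for the full observation window")
    return 0


if __name__ == "__main__":
    sys.exit(main())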
Tekton Pipeline for ML
apiVersion: tekton.dev/v1beta1
kind: Pipeline
metadata:
  name: ml-deployment-pipeline
spec:
  params:
    - name: git-url
      type: string
    - name: git-revision
      type: string
      default: main
    - name: image-name
      type: string
  workspaces:
    - name: shared-workspace
    - name: docker-credentials
  tasks:
    - name: fetch-source
      taskRef:
        name: git-clone
      params:
        - name: url
          value: $(params.git-url)
        - name: revision
          value: $(params.git-revision)
      workspaces:
        - name: output
          workspace: shared-workspace

    - name: run-tests
      runAfter: [fetch-source]
      taskSpec:
        workspaces:
          - name: source
        steps:
          - name: test
            image: python:3.11
            script: |
              cd $(workspaces.source.path)
              pip install -r requirements.txt
              pytest tests/ -v --junitxml=test-results.xml
      workspaces:
        - name: source
          workspace: shared-workspace

    - name: validate-model
      runAfter: [run-tests]
      taskSpec:
        workspaces:
          - name: source
        steps:
          - name: validate
            image: python:3.11
            script: |
              cd $(workspaces.source.path)
              python scripts/validate_model.py \
                --model-path models/latest \
                --validation-data data/validation.csv \
                --min-accuracy 0.95
      workspaces:
        - name: source
          workspace: shared-workspace

    - name: build-image
      runAfter: [validate-model]
      taskRef:
        name: kaniko
      params:
        - name: IMAGE
          value: $(params.image-name)
      workspaces:
        - name: source
          workspace: shared-workspace
        - name: dockerconfig
          workspace: docker-credentials

    - name: deploy-canary
      runAfter: [build-image]
      taskRef:
        name: kubernetes-actions
      params:
        - name: script
          value: |
            kubectl set image deployment/inference-canary \
              inference=$(params.image-name) -n ml-serving
            kubectl rollout status deployment/inference-canary \
              -n ml-serving --timeout=300s
    - name: run-canary-analysis
      runAfter: [deploy-canary]
      taskSpec:
        steps:
          - name: analyze
            image: python:3.11
            script: |
              #!/usr/bin/env python3
              # Query Prometheus for canary vs. stable error rates, then
              # compare and decide: fail the pipeline (exit 1) so Tekton
              # stops the rollout if the canary is doing worse.
              import json, sys, urllib.parse, urllib.request

              def error_rate(deployment):
                  q = f"sum(rate(inference_errors_total{{deployment='{deployment}'}}[10m]))"
                  url = "http://prometheus:9090/api/v1/query?" + urllib.parse.urlencode({"query": q})
                  with urllib.request.urlopen(url, timeout=10) as resp:
                      result = json.load(resp)["data"]["result"]
                  return float(result[0]["value"][1]) if result else 0.0

              if error_rate("canary") > error_rate("stable"):
                  print("Canary has a higher error rate, rolling back")
                  sys.exit(1)
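To execute this pipeline, you create a PipelineRun that binds its params and workspaces. A minimal sketch, assuming a pre-provisioned PVC named ml-pipeline-pvc and a Docker config secret named registry-creds (both names, and the git URL, are placeholders):

apiVersion: tekton.dev/v1beta1
kind: PipelineRun
metadata:
  generateName: ml-deployment-run-
spec:
  pipelineRef:
    name: ml-deployment-pipeline
  params:
    - name: git-url
      value: https://github.com/example/ml-inference.git  # placeholder repo
    - name: git-revision
      value: main
    - name: image-name
      value: gcr.io/ml-production/inference:latest
  workspaces:
    - name: shared-workspace
      persistentVolumeClaim:
        claimName: ml-pipeline-pvc       # assumed pre-provisioned PVC
    - name: docker-credentials
      secret:
        secretName: registry-creds       # assumed docker config secret

Because it uses generateName, submit it with kubectl create -f rather than kubectl apply.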
Model Validation Gate
apiVersion: tekton.dev/v1beta1
kind: Task
metadata:
  name: model-validation-gate
spec:
  params:
    - name: model-uri
      type: string
    - name: min-accuracy
      type: string
      default: "0.95"
    - name: max-latency-ms
      type: string
      default: "100"
  steps:
    - name: download-model
      image: amazon/aws-cli
      script: |
        aws s3 cp $(params.model-uri) /workspace/model

    - name: validate-accuracy
      image: python:3.11
      script: |
        pip install scikit-learn numpy
        python << 'EOF'
        import sys
        import pickle
        from sklearn.metrics import accuracy_score
        with open('/workspace/model', 'rb') as f:
            model = pickle.load(f)
        # Load validation data (load_validation_data is a project-specific
        # helper assumed to be provided by the repository)
        X_val, y_val = load_validation_data()
        predictions = model.predict(X_val)
        accuracy = accuracy_score(y_val, predictions)
        if accuracy < float("$(params.min-accuracy)"):
            print(f"Model accuracy {accuracy} is below the threshold")
            sys.exit(1)
        EOF

    - name: validate-latency
      image: python:3.11
      script: |
        pip install scikit-learn numpy
        python << 'EOF'
        import sys
        import time
        import pickle
        with open('/workspace/model', 'rb') as f:
            model = pickle.load(f)
        # Measure inference latency over 100 single-request predictions
        latencies = []
        for _ in range(100):
            start = time.time()
            model.predict([[1, 2, 3, 4]])  # dummy 4-feature input
            latencies.append((time.time() - start) * 1000)
        # p99 of 100 samples is the 99th order statistic (index 98)
        p99_latency = sorted(latencies)[98]
        if p99_latency > float("$(params.max-latency-ms)"):
            print(f"P99 latency {p99_latency}ms exceeds the threshold")
            sys.exit(1)
        EOF
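Running the gate against a specific artifact is just a TaskRun that overrides the defaults. A sketch, with a placeholder model URI:

apiVersion: tekton.dev/v1beta1
kind: TaskRun
metadata:
  generateName: model-validation-gate-run-
spec:
  taskRef:
    name: model-validation-gate
  params:
    - name: model-uri
      value: s3://ml-artifacts/models/latest/model.pkl  # placeholder bucket/path
    - name: min-accuracy
      value: "0.97"     # tighten the gate for a stricter release
    - name: max-latency-ms
      value: "50"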
Automated Rollback
# Argo Rollouts with automated rollback
apiVersion: argoproj.io/v1alpha1
kind: Rollout
metadata:
  name: inference-rollout
spec:
  strategy:
    canary:
      steps:
        - setWeight: 10
        - pause: {duration: 5m}
        - analysis:
            templates:
              - templateName: success-rate
              - templateName: latency-check
            args:
              # Pass the canary pod hash so templates can scope their queries
              - name: canary-hash
                valueFrom:
                  podTemplateHashValue: Latest
        - setWeight: 50
        - pause: {duration: 10m}
        - analysis:
            templates:
              - templateName: success-rate
            args:
              - name: canary-hash
                valueFrom:
                  podTemplateHashValue: Latest
        - setWeight: 100
      canaryService: inference-canary
      stableService: inference-stable
      # Automated rollback on failure
      abortScaleDownDelaySeconds: 30
---
apiVersion: argoproj.io/v1alpha1
kind: AnalysisTemplate
metadata:
  name: success-rate
spec:
  args:
    - name: canary-hash
  metrics:
    - name: success-rate
      successCondition: result[0] >= 0.99
      failureCondition: result[0] < 0.95
      failureLimit: 3
      provider:
        prometheus:
          address: http://prometheus:9090
          query: |
            sum(rate(inference_success_total{rollouts_pod_template_hash="{{args.canary-hash}}"}[5m])) /
            sum(rate(inference_requests_total{rollouts_pod_template_hash="{{args.canary-hash}}"}[5m]))
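The Rollout also references a latency-check template that is not defined above. One plausible sketch, assuming latency is exported as an inference_latency_seconds Prometheus histogram (the metric name and the 2-second threshold are assumptions):

apiVersion: argoproj.io/v1alpha1
kind: AnalysisTemplate
metadata:
  name: latency-check
spec:
  args:
    - name: canary-hash
  metrics:
    - name: p99-latency
      # Abort the rollout if canary p99 latency exceeds 2 seconds
      successCondition: result[0] <= 2.0
      failureLimit: 3
      provider:
        prometheus:
          address: http://prometheus:9090
          query: |
            histogram_quantile(0.99,
              sum(rate(inference_latency_seconds_bucket{rollouts_pod_template_hash="{{args.canary-hash}}"}[5m])) by (le))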
Congratulations! You have completed the Kubernetes for AI/ML course. You now have the knowledge to deploy, scale, and operate production ML workloads on Kubernetes.