ArgoCD لنشر ML

يمكّن ArgoCD التسليم المستمر القائم على GitOps لمنصات ML، مما يضمن عمليات نشر تصريحية وخاضعة للتحكم في الإصدارات. وعادةً ما تشهد المنظمات التي تتبنى GitOps دورات إصدار أسرع بشكل ملحوظ وتحسّناً في الامتثال لمتطلبات التدقيق.

بنية GitOps لـ ML

┌─────────────────────────────────────────────────────────────────────┐
│                    GitOps لمنصة ML                                   │
├─────────────────────────────────────────────────────────────────────┤
│                                                                      │
│  ┌─────────────────────────────────────────────────────────────┐    │
│  │                    مستودع Git (مصدر الحقيقة)                 │    │
│  │  ┌──────────────┐  ┌──────────────┐  ┌──────────────┐       │    │
│  │  │ تكوينات     │  │ تكوينات     │  │ تكوينات     │       │    │
│  │  │ النماذج     │  │ البنية التحتية│  │ التطبيقات   │       │    │
│  │  └──────────────┘  └──────────────┘  └──────────────┘       │    │
│  └─────────────────────────────────────────────────────────────┘    │
│                              │                                       │
│                        مراجعة PR + الموافقة                          │
│                              ↓                                       │
│  ┌─────────────────────────────────────────────────────────────┐    │
│  │                    متحكم ArgoCD                              │    │
│  │  - مراقبة حالة المزامنة                                      │    │
│  │  - اكتشاف الانحراف                                          │    │
│  │  - التراجع الآلي                                             │    │
│  └─────────────────────────────────────────────────────────────┘    │
│                              │                                       │
│                              ↓                                       │
│  ┌─────────────────────────────────────────────────────────────┐    │
│  │                  مجموعة Kubernetes                           │    │
│  │  [خدمة ML] [التدريب] [المراقبة] [مخزن الميزات]              │    │
│  └─────────────────────────────────────────────────────────────┘    │
│                                                                      │
└─────────────────────────────────────────────────────────────────────┘

تثبيت ArgoCD

# Install Argo CD into its own namespace from the upstream "stable" manifest.
kubectl create namespace argocd
kubectl apply --namespace argocd --filename https://raw.githubusercontent.com/argoproj/argo-cd/stable/manifests/install.yaml

# Print the initial admin password (stored base64-encoded in the secret).
kubectl --namespace argocd get secret argocd-initial-admin-secret --output jsonpath='{.data.password}' | base64 --decode

# Install the Argo CD CLI (Homebrew).
brew install argocd

# Log in to the API server.
# NOTE(review): this hostname is a cluster-internal service name; confirm it
# resolves from the machine where the CLI runs.
argocd login argocd-server.argocd.svc.cluster.local

تطبيق منصة ML

# Argo CD Application for the ML platform.
# Source: Helm chart at environments/production on the main branch of the
# ml-platform-config repository. Target: the ml-serving namespace of the
# in-cluster API server.
apiVersion: argoproj.io/v1alpha1
kind: Application
metadata:
  namespace: argocd
  name: ml-platform
  finalizers:
  - resources-finalizer.argocd.argoproj.io
spec:
  project: ml-production
  destination:
    namespace: ml-serving
    server: https://kubernetes.default.svc
  source:
    repoURL: https://github.com/org/ml-platform-config
    path: environments/production
    targetRevision: main
    helm:
      # Listed in order: base values first, then the production overrides.
      valueFiles:
      - values.yaml
      - values-production.yaml
  syncPolicy:
    # Automated sync with pruning and self-healing switched on; a sync that
    # would leave the application with no resources is not allowed.
    automated:
      allowEmpty: false
      prune: true
      selfHeal: true
    # Retry a failed sync up to 5 times, starting at 5s and doubling each
    # attempt, capped at 3 minutes.
    retry:
      limit: 5
      backoff:
        duration: 5s
        factor: 2
        maxDuration: 3m
    syncOptions:
    - CreateNamespace=true
    - PruneLast=true
    - ApplyOutOfSyncOnly=true
---
# Argo CD AppProject for ML production. It restricts which repositories may be
# used as sources, which namespaces may be deployed to, and which resource
# kinds Applications in this project may manage.
apiVersion: argoproj.io/v1alpha1
kind: AppProject
metadata:
  name: ml-production
  namespace: argocd
spec:
  description: إنتاج منصة ML
  sourceRepos:
  - https://github.com/org/ml-platform-config
  - https://github.com/org/ml-models
  destinations:
  - namespace: ml-serving
    server: https://kubernetes.default.svc
  - namespace: ml-training
    server: https://kubernetes.default.svc
  # Cluster-scoped kinds the project may manage.
  clusterResourceWhitelist:
  - group: ''
    kind: Namespace
  - group: 'apiextensions.k8s.io'
    kind: CustomResourceDefinition
  # Namespaced kinds the project may manage. Every kind synced by the
  # project's Applications has to be listed here, otherwise the sync is
  # rejected as not permitted in the project.
  namespaceResourceWhitelist:
  - group: ''
    kind: '*'
  - group: 'apps'
    kind: '*'
  - group: 'serving.kserve.io'
    kind: '*'
  # Autoscaling and disruption budgets shipped alongside the model services.
  - group: 'autoscaling'
    kind: HorizontalPodAutoscaler
  - group: 'policy'
    kind: PodDisruptionBudget
  # Argo Rollouts resources used for progressive delivery. Listed by kind
  # rather than '*' so the project cannot create other argoproj.io resources.
  - group: 'argoproj.io'
    kind: Rollout
  - group: 'argoproj.io'
    kind: AnalysisTemplate

نشر النماذج مع GitOps

# models/llm-service/kustomization.yaml
# Kustomization for the LLM service: bundles the InferenceService, HPA and
# PDB manifests, generates the model ConfigMap, pins the image tag, and
# stamps ownership/version labels on every resource.
apiVersion: kustomize.config.k8s.io/v1beta1
kind: Kustomization
namespace: ml-serving

resources:
- inference-service.yaml
- hpa.yaml
- pdb.yaml

configMapGenerator:
- name: model-config
  literals:
  - MODEL_VERSION=v2.1.0
  - MAX_BATCH_SIZE=32
  - ENABLE_CACHING=true

images:
- name: inference-server
  newTag: v2.1.0

# `labels` replaces the deprecated `commonLabels`. Selectors are excluded on
# purpose: a version-stamped label inside a selector would change on every
# model bump, and would also stop matching pods that do not carry the label.
labels:
- pairs:
    app.kubernetes.io/managed-by: argocd
    model.ml/version: v2.1.0
  includeSelectors: false
---
# models/llm-service/inference-service.yaml
# KServe InferenceService for the LLM: a PyTorch-format model loaded from S3,
# scaled between 2 and 10 predictor replicas, one GPU per replica.
apiVersion: serving.kserve.io/v1beta1
kind: InferenceService
metadata:
  name: llm-service
  annotations:
    argocd.argoproj.io/sync-wave: "2"
spec:
  predictor:
    minReplicas: 2
    maxReplicas: 10
    model:
      modelFormat:
        name: pytorch
      storageUri: "s3://models/llm/v2.1.0"
      resources:
        # GPUs are an extended resource: Kubernetes requires them to be set
        # in `limits`, and a request, if present, must equal the limit.
        requests:
          nvidia.com/gpu: 1
        limits:
          nvidia.com/gpu: 1

الإطلاق التدريجي مع Argo Rollouts

# Argo Rollouts canary for the inference workload: 10 replicas with one GPU
# each, traffic shifted 10% -> 30% -> 50% -> 100% through an Istio
# VirtualService, with an analysis step after the 10% and 30% stages.
apiVersion: argoproj.io/v1alpha1
kind: Rollout
metadata:
  namespace: ml-serving
  name: inference-rollout
  annotations:
    argocd.argoproj.io/sync-wave: '3'
spec:
  replicas: 10
  selector:
    matchLabels:
      app: inference
  template:
    metadata:
      labels:
        app: inference
    spec:
      containers:
      - name: inference
        image: inference:v2
        resources:
          limits:
            nvidia.com/gpu: 1
  strategy:
    canary:
      # Services that front the stable and canary pods respectively.
      stableService: inference-stable
      canaryService: inference-canary
      trafficRouting:
        istio:
          virtualService:
            name: inference-vsvc
            routes:
            - primary
      # Steps run strictly in the order listed.
      steps:
      # Stage 1: 10% of traffic, 5 minute soak, then the success-rate check
      # against the canary service.
      - setWeight: 10
      - pause:
          duration: 5m
      - analysis:
          args:
          - name: service-name
            value: inference-canary
          templates:
          - templateName: inference-success-rate
      # Stage 2: 30% of traffic, 10 minute soak, then the latency check.
      - setWeight: 30
      - pause:
          duration: 10m
      - analysis:
          templates:
          - templateName: inference-latency
      # Stage 3: 50% of traffic, 10 minute soak, then full promotion.
      - setWeight: 50
      - pause:
          duration: 10m
      - setWeight: 100
---
# AnalysisTemplate that asks Prometheus for the fraction of successful
# inference requests over the last 5 minutes for the service passed in as
# the `service-name` argument.
apiVersion: argoproj.io/v1alpha1
kind: AnalysisTemplate
metadata:
  name: inference-success-rate
spec:
  args:
  - name: service-name
  metrics:
  - name: success-rate
    # Five measurements, one minute apart; a measurement succeeds when the
    # ratio is at least 0.99, and failureLimit is set to 2.
    count: 5
    interval: 1m
    failureLimit: 2
    successCondition: 'result[0] >= 0.99'
    provider:
      prometheus:
        address: http://prometheus:9090
        # Successful requests divided by all requests.
        query: |
          sum(rate(inference_requests_total{service="{{args.service-name}}",status="success"}[5m])) /
          sum(rate(inference_requests_total{service="{{args.service-name}}"}[5m]))

موجات المزامنة لمكونات ML

# Deployment ordering with sync waves. Each manifest below is a skeleton that
# shows only the wave annotation.
# Wave 0: namespaces and RBAC
apiVersion: v1
kind: Namespace
metadata:
  name: ml-serving
  annotations:
    argocd.argoproj.io/sync-wave: '0'
---
# Wave 1: ConfigMaps and Secrets
apiVersion: v1
kind: Secret
metadata:
  name: model-credentials
  annotations:
    argocd.argoproj.io/sync-wave: '1'
---
# Wave 2: storage (PVCs)
apiVersion: v1
kind: PersistentVolumeClaim
metadata:
  name: model-cache
  annotations:
    argocd.argoproj.io/sync-wave: '2'
---
# Wave 3: Services
apiVersion: v1
kind: Service
metadata:
  name: inference-service
  annotations:
    argocd.argoproj.io/sync-wave: '3'
---
# Wave 4: Deployments
apiVersion: apps/v1
kind: Deployment
metadata:
  name: inference
  annotations:
    argocd.argoproj.io/sync-wave: '4'
---
# Wave 5: autoscaling
apiVersion: autoscaling/v2
kind: HorizontalPodAutoscaler
metadata:
  name: inference-hpa
  annotations:
    argocd.argoproj.io/sync-wave: '5'

إشعارات ArgoCD لـ ML

# Argo CD Notifications configuration for ML deployments: two triggers
# (deployed / sync failed) and the message templates they send.
# NOTE(review): no `service.*` entries are defined in this ConfigMap; confirm
# the Slack and PagerDuty services and the application subscriptions are
# configured elsewhere.
apiVersion: v1
kind: ConfigMap
metadata:
  name: argocd-notifications-cm
  namespace: argocd
data:
  # Fires once per synced revision, and only when the sync succeeded AND the
  # application reports Healthy, so "deployed" is not announced for a model
  # that has not come up yet.
  trigger.on-deployed: |
    - description: اكتمل نشر النموذج
      oncePer: app.status.sync.revision
      send: [slack-ml-team]
      when: app.status.operationState.phase in ['Succeeded'] and app.status.health.status == 'Healthy'

  # Fires when the sync operation ends in Error or Failed. Every name under
  # `send` is a template name and must have a matching `template.<name>` key.
  trigger.on-sync-failed: |
    - description: فشلت مزامنة النموذج
      send: [slack-ml-team, pagerduty]
      when: app.status.operationState.phase in ['Error', 'Failed']

  # Chat message: status emoji, application name, phase, revision, and the
  # per-resource sync results. The failure emoji covers both phases that
  # on-sync-failed fires on.
  template.slack-ml-team: |
    message: |
      {{if eq .app.status.operationState.phase "Succeeded"}}:white_check_mark:{{end}}
      {{if eq .app.status.operationState.phase "Failed" "Error"}}:x:{{end}}
      التطبيق {{.app.metadata.name}} المزامنة {{.app.status.operationState.phase}}
      المراجعة: {{.app.status.sync.revision}}
      {{range .app.status.operationState.syncResult.resources}}
      - {{.kind}}/{{.name}}: {{.status}}
      {{end}}

  # Template referenced by on-sync-failed. It sets only the PagerDuty fields
  # and no `message`, so it does not replace the chat message above.
  # NOTE(review): assumes the PagerDuty v2 (Events API) service; adjust the
  # section name if a different PagerDuty integration is in use.
  template.pagerduty: |
    pagerdutyv2:
      summary: "فشلت مزامنة التطبيق {{.app.metadata.name}}"
      severity: "critical"
      source: "{{.app.metadata.name}}"

الدرس التالي: المراقبة والتنبيه لأنظمة ML الإنتاجية.