Install From K8s Operator

Compared with the complex binary installation, the k8s operator is an easier way to install Slurm.

The source code can be found at https://github.com/AaronYang0628/slurm-on-k8s

Prerequisites

  1. Kubernetes is installed; if not, check 🔗link
  2. The Helm binary is installed; if not, check 🔗link
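
You can verify both prerequisites from a shell (any reasonably recent client versions should work):

    kubectl version --client
    helm version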

Installation

  1. Deploy the Slurm operator

    kubectl apply -f https://raw.githubusercontent.com/AaronYang0628/helm-chart-mirror/refs/heads/main/templates/slurm/operator_install.yaml
    Expected Output
    [root@ay-zj-ecs operator]# kubectl apply -f https://raw.githubusercontent.com/AaronYang0628/helm-chart-mirror/refs/heads/main/templates/slurm/operator_install.yaml
    namespace/slurm created
    customresourcedefinition.apiextensions.k8s.io/slurmdeployments.slurm.ay.dev created
    serviceaccount/slurm-operator-controller-manager created
    role.rbac.authorization.k8s.io/slurm-operator-leader-election-role created
    clusterrole.rbac.authorization.k8s.io/slurm-operator-manager-role created
    clusterrole.rbac.authorization.k8s.io/slurm-operator-metrics-auth-role created
    clusterrole.rbac.authorization.k8s.io/slurm-operator-metrics-reader created
    clusterrole.rbac.authorization.k8s.io/slurm-operator-slurmdeployment-admin-role created
    clusterrole.rbac.authorization.k8s.io/slurm-operator-slurmdeployment-editor-role created
    clusterrole.rbac.authorization.k8s.io/slurm-operator-slurmdeployment-viewer-role created
    rolebinding.rbac.authorization.k8s.io/slurm-operator-leader-election-rolebinding created
    clusterrolebinding.rbac.authorization.k8s.io/slurm-operator-manager-rolebinding created
    clusterrolebinding.rbac.authorization.k8s.io/slurm-operator-metrics-auth-rolebinding created
    service/slurm-operator-controller-manager-metrics-service created
    deployment.apps/slurm-operator-controller-manager created
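
    Optionally, wait until the operator deployment reports Available before continuing (the deployment name comes from the output above):

    kubectl -n slurm wait --for=condition=Available deployment/slurm-operator-controller-manager --timeout=120s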
  2. Check the operator status

    kubectl -n slurm get pod
    Expected Output
    [root@ay-zj-ecs operator]# kubectl -n slurm get pod
    NAME                                READY   STATUS    RESTARTS   AGE
    slurm-operator-controller-manager   1/1     Running   0          27s
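
    If the pod is not Running, describe it to see recent events (the label below assumes the kubebuilder default control-plane=controller-manager; adjust if your operator uses a different one):

    kubectl -n slurm describe pod -l control-plane=controller-manager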
  3. Create a slurmdeployment custom resource

    kubectl apply -f https://raw.githubusercontent.com/AaronYang0628/helm-chart-mirror/refs/heads/main/templates/slurm/slurmdeployment.zj.values.yaml
    Expected Output
    [root@ay-zj-ecs operator]# kubectl apply -f https://raw.githubusercontent.com/AaronYang0628/helm-chart-mirror/refs/heads/main/templates/slurm/slurmdeployment.zj.values.yaml
    slurmdeployment.slurm.ay.dev/lensing created
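
    To inspect the full spec of the custom resource you just created:

    kubectl get slurmdeployment lensing -o yaml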
  4. Check the slurmdeployment status

    kubectl get slurmdeployment
    kubectl -n slurm logs -f deploy/slurm-operator-controller-manager
    # slurmdep is a registered short name for slurmdeployment:
    # kubectl get slurmdep
    # kubectl -n test get pods
    Expected Output
    [root@ay-zj-ecs ~]# kubectl get slurmdep -w
    NAME      CPU   GPU   LOGIN   CTLD   DBD   DBSVC   JOB COMMAND                     STATUS
    lensing   0/1   0/0   0/1     0/1    0/1   0/1     sh -c srun -N 2 /bin/hostname   
    lensing   1/2   0/0   1/1     1/1    1/1   1/1     sh -c srun -N 2 /bin/hostname   
    lensing   2/2   0/0   1/1     1/1    1/1   1/1     sh -c srun -N 2 /bin/hostname   
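
    The short name slurmdep used above is registered by the CRD; you can confirm it in the SHORTNAMES column:

    kubectl api-resources | grep slurm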
  5. Scale up the slurmdeployment

    kubectl edit slurmdep lensing
    # in the editor, set SlurmCPU.replicas = 3
    Expected Output
    [root@ay-zj-ecs ~]# kubectl edit slurmdep lensing
    slurmdeployment.slurm.ay.dev/lensing edited
    
    [root@ay-zj-ecs ~]# kubectl get slurmdep -w
    NAME      CPU   GPU   LOGIN   CTLD   DBD   DBSVC   JOB COMMAND                     STATUS
    lensing   2/2   0/0   1/1     1/1    1/1   1/1     sh -c srun -N 2 /bin/hostname   
    lensing   2/3   0/0   1/1     1/1    1/1   1/1     sh -c srun -N 2 /bin/hostname   
    lensing   3/3   0/0   1/1     1/1    1/1   1/1     sh -c srun -N 2 /bin/hostname
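
    If you prefer a non-interactive change, a merge patch can do the same thing. This is a sketch: the field path spec.slurmCPU.replicas is assumed from the edit note above, so verify the exact key first with kubectl explain slurmdeployment.spec.

    # assumed field path; confirm with: kubectl explain slurmdeployment.spec
    kubectl patch slurmdep lensing --type merge -p '{"spec":{"slurmCPU":{"replicas":3}}}'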