Install On Debian

Cluster Setting

1 manager node, 1 login node, and 2 compute nodes:

| hostname  | IP             | role    | quota |
|-----------|----------------|---------|-------|
| manage01  | 192.168.56.115 | manager | 2C4G  |
| login01   | 192.168.56.116 | login   | 2C4G  |
| compute01 | 192.168.56.117 | compute | 2C4G  |
| compute02 | 192.168.56.118 | compute | 2C4G  |

Software Version:

| software | version            |
|----------|--------------------|
| OS       | Debian 12 bookworm |
| Slurm    | 24.05.2            |

Prepare Steps (All Nodes)

  1. modify the /etc/network/interfaces file (if you cannot get an IPv4 address)

Append the following lines to the file, replacing enp0s8 with your interface name as reported by `ip link`:

allow-hotplug enp0s8
iface enp0s8 inet dhcp

restart the network

systemctl restart networking
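
To confirm the interface picked up an IPv4 address (same interface name as above):

ip -4 addr show enp0s8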
  2. modify the /etc/apt/sources.list file to use the TUNA mirror
cat > /etc/apt/sources.list << EOF
deb https://mirrors.tuna.tsinghua.edu.cn/debian/ bookworm main contrib non-free non-free-firmware
deb-src https://mirrors.tuna.tsinghua.edu.cn/debian/ bookworm main contrib non-free non-free-firmware

deb https://mirrors.tuna.tsinghua.edu.cn/debian/ bookworm-updates main contrib non-free non-free-firmware
deb-src https://mirrors.tuna.tsinghua.edu.cn/debian/ bookworm-updates main contrib non-free non-free-firmware

deb https://mirrors.tuna.tsinghua.edu.cn/debian/ bookworm-backports main contrib non-free non-free-firmware
deb-src https://mirrors.tuna.tsinghua.edu.cn/debian/ bookworm-backports main contrib non-free non-free-firmware

deb https://mirrors.tuna.tsinghua.edu.cn/debian-security/ bookworm-security main contrib non-free non-free-firmware
deb-src https://mirrors.tuna.tsinghua.edu.cn/debian-security/ bookworm-security main contrib non-free non-free-firmware
EOF
  3. update the apt cache
apt clean all && apt update
  4. set the hostname on each node

Run the matching command on the corresponding node:
hostnamectl set-hostname manage01
hostnamectl set-hostname login01
hostnamectl set-hostname compute01
hostnamectl set-hostname compute02
  5. set the hosts file
cat >> /etc/hosts << EOF
192.168.56.115 manage01
192.168.56.116 login01
192.168.56.117 compute01
192.168.56.118 compute02
EOF
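
A quick check that the names resolve and the nodes are reachable (run from any node):

for h in manage01 login01 compute01 compute02; do ping -c1 -W1 $h >/dev/null && echo "$h ok"; done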
  6. disable the firewall
systemctl stop nftables && systemctl disable nftables
  7. install the ntpdate package
apt-get -y install ntpdate

set the timezone and sync the server time

ln -sf /usr/share/zoneinfo/Asia/Shanghai /etc/localtime
echo 'Asia/Shanghai' >/etc/timezone
ntpdate time.windows.com
  8. add a cron job to sync time
crontab -e
*/5 * * * * /usr/sbin/ntpdate time.windows.com
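
To add the job non-interactively instead of opening the editor:

(crontab -l 2>/dev/null; echo '*/5 * * * * /usr/sbin/ntpdate time.windows.com') | crontab -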
  9. create an ssh key pair on each node
ssh-keygen -t rsa -b 4096 -C $HOSTNAME
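
To generate the key without prompts (default path, empty passphrase):

ssh-keygen -t rsa -b 4096 -C "$HOSTNAME" -N "" -f ~/.ssh/id_rsa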
  10. ssh login without password (All Nodes)

On each node, copy the public key to every other node. For example, on manage01:
ssh-copy-id -i ~/.ssh/id_rsa.pub root@login01
ssh-copy-id -i ~/.ssh/id_rsa.pub root@compute01
ssh-copy-id -i ~/.ssh/id_rsa.pub root@compute02

and on login01:

ssh-copy-id -i ~/.ssh/id_rsa.pub root@manage01
ssh-copy-id -i ~/.ssh/id_rsa.pub root@compute01
ssh-copy-id -i ~/.ssh/id_rsa.pub root@compute02

Repeat the same pattern on compute01 and compute02.
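
To verify passwordless login from the current node:

for h in manage01 login01 compute01 compute02; do ssh root@$h hostname; done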

Install Components

  1. Install NFS server (Manager Node)

There are many ways to install an NFS server; one straightforward route is shown below.
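
If the NFS server packages are not already present, install them from the Debian repositories:

apt-get install -y nfs-kernel-server rpcbind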

create shared folder

mkdir /data
chmod 755 /data

edit /etc/exports

/data *(rw,sync,insecure,no_subtree_check,no_root_squash)

start nfs server

systemctl start rpcbind 
systemctl start nfs-server 

systemctl enable rpcbind 
systemctl enable nfs-server

check nfs server

showmount -e localhost

# Output
Export list for localhost:
/data *
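
The share is not mounted anywhere yet. A minimal sketch for mounting it on the login and compute nodes (package name and mount point mirror the server side; adjust as needed):

apt-get install -y nfs-common
mkdir -p /data
mount -t nfs manage01:/data /data
echo 'manage01:/data /data nfs defaults 0 0' >> /etc/fstab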
  2. Install the munge service
  • add user munge (All Nodes)
groupadd -g 1108 munge
useradd -m -c "Munge Uid 'N' Gid Emporium" -d /var/lib/munge -u 1108 -g munge -s /sbin/nologin munge
  • Install rng-tools-debian (Manager Node)
apt-get install -y rng-tools-debian
# modify service script
vim /usr/lib/systemd/system/rngd.service
[Service]
ExecStart=/usr/sbin/rngd -f -r /dev/urandom
systemctl daemon-reload
systemctl start rngd
systemctl enable rngd
  • install munge packages (All Nodes)
apt-get install -y libmunge-dev libmunge2 munge
  • generate a secret key (Manager Node)
dd if=/dev/urandom bs=1 count=1024 > /etc/munge/munge.key
  • copy munge.key from the manager node to the other nodes (Manager Node)
scp -p /etc/munge/munge.key root@login01:/etc/munge/
scp -p /etc/munge/munge.key root@compute01:/etc/munge/
scp -p /etc/munge/munge.key root@compute02:/etc/munge/
  • grant privilege on munge.key (All Nodes)
chown munge: /etc/munge/munge.key
chmod 400 /etc/munge/munge.key

systemctl start munge
systemctl enable munge

Use `systemctl status munge` to check whether the service is running.

  • test munge
munge -n | ssh compute01 unmunge
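
To test against all nodes from the manager in one go (each line should report Success):

for h in login01 compute01 compute02; do munge -n | ssh $h unmunge | grep STATUS; done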
  3. Install MariaDB (Manager Node)
apt-get install -y mariadb-server
  • create database and user
systemctl start mariadb
systemctl enable mariadb

ROOT_PASS=$(tr -dc A-Za-z0-9 </dev/urandom | head -c 16) 
mysql -e "CREATE USER root IDENTIFIED BY '${ROOT_PASS}'"
mysql -uroot -p$ROOT_PASS -e 'create database slurm_acct_db'
  • create user slurm, and grant all privileges on the slurm_acct_db database
mysql -uroot -p$ROOT_PASS
create user slurm;

grant all on slurm_acct_db.* TO 'slurm'@'localhost' identified by '123456' with grant option;

flush privileges;
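
Exit the MySQL shell, then confirm the slurm user can reach the database with the password set above:

mysql -u slurm -p123456 -e 'show databases;'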
  • create the Slurm user (All Nodes)
groupadd -g 1109 slurm
useradd -m -c "Slurm manager" -d /var/lib/slurm -u 1109 -g slurm -s /bin/bash slurm

Install Slurm (All Nodes)

  • Install basic Debian package build requirements:
apt-get install -y build-essential fakeroot devscripts equivs
  • Unpack the distributed tarball:
wget https://download.schedmd.com/slurm/slurm-24.05.2.tar.bz2 -O slurm-24.05.2.tar.bz2 &&
tar -xaf slurm*tar.bz2
  • cd to the directory containing the Slurm source:
cd slurm-24.05.2 && mkdir -p /etc/slurm && ./configure --sysconfdir=/etc/slurm

A plain ./configure defaults to /usr/local/etc for the configuration directory; passing --sysconfdir=/etc/slurm makes the daemons look in /etc/slurm, where this guide keeps its configuration files.
  • compile and install slurm
make && make install
  • modify configuration files (Manager Node)

    cp /root/slurm-24.05.2/etc/slurm.conf.example /etc/slurm/slurm.conf
    vim /etc/slurm/slurm.conf

    focus on these options:

    SlurmctldHost=manage01
    
    AccountingStorageEnforce=associations,limits,qos
    AccountingStorageHost=manage01
    AccountingStoragePass=/var/run/munge/munge.socket.2
    AccountingStoragePort=6819  
    AccountingStorageType=accounting_storage/slurmdbd  
    
    JobCompHost=localhost
    JobCompLoc=slurm_acct_db
    JobCompPass=123456
    JobCompPort=3306
    JobCompType=jobcomp/mysql
    JobCompUser=slurm
    JobContainerType=job_container/none
    JobAcctGatherType=jobacct_gather/linux
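
    Every node that runs slurmd also needs a NodeName entry, and at least one partition must be defined. A minimal sketch for the two compute nodes (RealMemory is an assumption; check the real values with `slurmd -C` on a compute node):

    NodeName=compute[01-02] CPUs=2 RealMemory=3500 State=UNKNOWN
    PartitionName=compute Nodes=compute[01-02] Default=YES MaxTime=INFINITE State=UP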
    cp /root/slurm-24.05.2/etc/slurmdbd.conf.example /etc/slurm/slurmdbd.conf
    vim /etc/slurm/slurmdbd.conf
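
    The key slurmdbd.conf options, mirroring the database, user, and password created earlier (a sketch; adjust paths to your layout):

    AuthType=auth/munge
    DbdHost=manage01
    DbdPort=6819
    SlurmUser=slurm
    LogFile=/var/log/slurm/slurmdbd.log
    PidFile=/var/run/slurmdbd/slurmdbd.pid
    StorageType=accounting_storage/mysql
    StorageHost=localhost
    StoragePort=3306
    StorageUser=slurm
    StoragePass=123456
    StorageLoc=slurm_acct_db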
    • modify /etc/slurm/cgroup.conf
    cp /root/slurm-24.05.2/etc/cgroup.conf.example /etc/slurm/cgroup.conf
    • send configuration files to other nodes
    scp -r /etc/slurm/*.conf  root@login01:/etc/slurm/
    scp -r /etc/slurm/*.conf  root@compute01:/etc/slurm/
    scp -r /etc/slurm/*.conf  root@compute02:/etc/slurm/
  • create the required directories and set their ownership (All Nodes)

mkdir /var/spool/slurmd
chown slurm: /var/spool/slurmd
mkdir /var/log/slurm
chown slurm: /var/log/slurm

mkdir /var/spool/slurmctld
chown slurm: /var/spool/slurmctld

chown slurm: /etc/slurm/slurmdbd.conf
chmod 600 /etc/slurm/slurmdbd.conf
  • start slurm services on each node

On manage01:

systemctl start slurmdbd
systemctl enable slurmdbd

systemctl start slurmctld
systemctl enable slurmctld

systemctl start slurmd
systemctl enable slurmd

On login01, compute01 and compute02:

systemctl start slurmd
systemctl enable slurmd

Use `systemctl status <service>` to check whether each service is running. For reference, the systemd unit files are shown below.
```text
# vim /usr/lib/systemd/system/slurmdbd.service


[Unit]
Description=Slurm DBD accounting daemon
After=network-online.target remote-fs.target munge.service mysql.service mysqld.service mariadb.service sssd.service
Wants=network-online.target
ConditionPathExists=/etc/slurm/slurmdbd.conf

[Service]
Type=simple
EnvironmentFile=-/etc/sysconfig/slurmdbd
EnvironmentFile=-/etc/default/slurmdbd
User=slurm
Group=slurm
RuntimeDirectory=slurmdbd
RuntimeDirectoryMode=0755
ExecStart=/usr/local/sbin/slurmdbd -D -s $SLURMDBD_OPTIONS
ExecReload=/bin/kill -HUP $MAINPID
LimitNOFILE=65536


# Uncomment the following lines to disable logging through journald.
# NOTE: It may be preferable to set these through an override file instead.
#StandardOutput=null
#StandardError=null

[Install]
WantedBy=multi-user.target
```
```text
# vim /usr/lib/systemd/system/slurmctld.service


[Unit]
Description=Slurm controller daemon
After=network-online.target remote-fs.target munge.service sssd.service
Wants=network-online.target
ConditionPathExists=/etc/slurm/slurm.conf

[Service]
Type=notify
EnvironmentFile=-/etc/sysconfig/slurmctld
EnvironmentFile=-/etc/default/slurmctld
User=slurm
Group=slurm
RuntimeDirectory=slurmctld
RuntimeDirectoryMode=0755
ExecStart=/usr/local/sbin/slurmctld --systemd $SLURMCTLD_OPTIONS
ExecReload=/bin/kill -HUP $MAINPID
LimitNOFILE=65536


# Uncomment the following lines to disable logging through journald.
# NOTE: It may be preferable to set these through an override file instead.
#StandardOutput=null
#StandardError=null

[Install]
WantedBy=multi-user.target
```
```text
# vim /usr/lib/systemd/system/slurmd.service


[Unit]
Description=Slurm node daemon
After=munge.service network-online.target remote-fs.target sssd.service
Wants=network-online.target
#ConditionPathExists=/etc/slurm/slurm.conf

[Service]
Type=notify
EnvironmentFile=-/etc/sysconfig/slurmd
EnvironmentFile=-/etc/default/slurmd
RuntimeDirectory=slurm
RuntimeDirectoryMode=0755
ExecStart=/usr/local/sbin/slurmd --systemd $SLURMD_OPTIONS
ExecReload=/bin/kill -HUP $MAINPID
KillMode=process
LimitNOFILE=131072
LimitMEMLOCK=infinity
LimitSTACK=infinity
Delegate=yes


# Uncomment the following lines to disable logging through journald.
# NOTE: It may be preferable to set these through an override file instead.
#StandardOutput=null
#StandardError=null

[Install]
WantedBy=multi-user.target
```
  • test slurm

check cluster configuration
scontrol show config

check cluster status

sinfo
scontrol show partition
scontrol show node

submit job

srun -N2 hostname
scontrol show jobs
check job status
squeue -a
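
A simple batch job gives a further end-to-end test (the script name and options are illustrative):

cat > testjob.sh << 'EOF'
#!/bin/bash
#SBATCH --job-name=test
#SBATCH --nodes=2
#SBATCH --ntasks-per-node=1
#SBATCH --output=%x_%j.out
srun hostname
EOF

sbatch testjob.sh
squeue
cat test_*.out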