
Slurm Examples

The first example runs a stress test on a compute node:
#!/bin/bash 
 
#::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
#                     Slurm Construction Section
#::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::

# job name
#SBATCH --job-name=job-1

# partition (queue) declaration
#SBATCH --partition=dept_24

# number of requested nodes
#SBATCH --nodes=1

# number of tasks
#SBATCH --ntasks=1

# number of requested cores per task
#SBATCH --cpus-per-task=24

# request a node with a specific feature (disabled; remove the extra '#' to enable)
# #SBATCH --constraint=4C

# request a specific node to run the job (disabled)
# #SBATCH --nodelist=n096

# requested runtime (disabled)
# #SBATCH --time=00:05:00

#::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
#                     User Construction Section
#::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::

# current (working) directory
work_dir=$(pwd)

# username
user=$(whoami)

# directory name where job will be run (on compute node)
job_dir="${user}_${SLURM_JOB_ID}.dcb.private.net"

# create a scratch directory on the /scr folder of the compute node
mkdir -p "/scr/$job_dir"

# change to the newly created directory (abort if that failed)
cd "/scr/$job_dir" || exit 1

# copy the submit file (and all other related files/directories)
rsync -a "${work_dir}"/* .

# put date and time of starting job in a file
date > date.txt

# put hostname of compute node in a file
hostname > hostname.txt

# copy files on exit or interrupt
trap "echo 'copying files'; rsync -avz *.log *.txt ${work_dir}" EXIT

# run stress-ng (to put stress on the node) for 120 seconds
stress-ng --cpu $SLURM_CPUS_PER_TASK --timeout 120s --metrics-brief > stress-ng.log

# append date and time of finished job in a file
date >> date.txt
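
To try it, save the script under any name (stress-test.sbatch below is only a placeholder) and submit it from the directory that holds your input files; when the job ends, the EXIT trap copies the *.log and *.txt results back there:

sbatch stress-test.sbatch    # submit the job; prints the assigned job ID
squeue -u $(whoami)          # check the job state while it is queued or running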


The second example demonstrates how to run a job array:
#!/bin/bash 
  
#::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
#                     Slurm Construction Section
#::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::

# job name
#SBATCH --job-name=job-1

# partition (queue) declaration
#SBATCH --partition=dept_24

# number of requested nodes
#SBATCH --nodes=1

# job array with four tasks (indices 1-4); each task runs this
# script independently with its own SLURM_ARRAY_TASK_ID
#SBATCH --array=1-4

# number of tasks
#SBATCH --ntasks=1

# number of requested cores per task
#SBATCH --cpus-per-task=2

#::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
#                     User Construction Section
#::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::

# current (working) directory
work_dir=$(pwd)

# username
user=$(whoami)

# directory name where job will be run (on compute node)
job_dir="${user}_${SLURM_JOB_ID}.dcb.private.net"

# create a scratch directory on the /scr folder of the compute node
mkdir -p "/scr/$job_dir"

# change to the newly created directory (abort if that failed)
cd "/scr/$job_dir" || exit 1

# copy the submit file (and all other related files/directories)
rsync -a "${work_dir}"/* .

# put date and time of starting job in a file
date > date-$SLURM_ARRAY_TASK_ID.txt

# copy files on exit or interrupt
trap "echo 'copying files'; rsync -avz *.log *.txt ${work_dir}" EXIT

# run stress-ng (to put stress on the node) for 60 seconds
stress-ng --cpu $SLURM_CPUS_PER_TASK --timeout 60s --metrics-brief > stress-ng-$SLURM_ARRAY_TASK_ID.log

# put hostname of compute node in a file
hostname > hostname-$SLURM_ARRAY_TASK_ID.txt

# append date and time of finished job in a file
date >> date-$SLURM_ARRAY_TASK_ID.txt
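
A single submission of this script starts four independent tasks (array-test.sbatch below is only a placeholder name); each task writes its own date, hostname, and stress-ng files, so the copies back to the work directory do not collide:

sbatch array-test.sbatch    # one submission creates tasks <jobid>_1 through <jobid>_4
squeue -u $(whoami)         # array tasks are listed individually as <jobid>_<index>
scancel <jobid>             # cancels every remaining task in the array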