PyTorch/2.1.0-rocm-5.6.1-python-3.10-singularity-20231123 (PyTorch-2.1.0-rocm-5.6.1-python-3.10-singularity-20231123.eb)
Install with the EasyBuild-user module in partition/container:
module load LUMI partition/container EasyBuild-user
eb PyTorch-2.1.0-rocm-5.6.1-python-3.10-singularity-20231123.eb
To access module help after installation, use module spider PyTorch/2.1.0-rocm-5.6.1-python-3.10-singularity-20231123.
EasyConfig:
# Developed by Kurt Lust and Mihkel Tiks for LUMI
#DOC Contains PyTorch 2.1.0 with torchaudio 2.1.0+420d9ac, torchdata 0.6.1+e1feeb2, torchtext 0.15.2a0+4571036,
#DOC torchvision 0.16.0+a90e584 GPU version, on Python 3.10 and ROCm 5.6.1.
# This easyconfig skips the build step and only copies a prebuilt container
# image into the installation directory, so the generic MakeCp easyblock is used.
easyblock = "MakeCp"

name = "PyTorch"
version = "2.1.0"
versionsuffix = "-rocm-5.6.1-python-3.10-singularity-20231123"

# File name of the Singularity image that this module installs.
local_sif = "lumi-pytorch-rocm-5.6.1-python-3.10-pytorch-v2.1.0-dockerhash-10b8c0aaec73.sif"
#local_docker = 'lumi-pytorch-rocm-5.6.1-python-3.10-pytorch-v2.1.0.docker'

homepage = "https://pytorch.org/"

whatis = ["Description: PyTorch, a machine learning package"]
# Module description text. %(version)s is an EasyBuild template that expands
# to the value of `version`. The variable names listed here must match the
# ones the module really defines (SINGULARITY_BIND, RUNSCRIPTS, ...).
description = """
This module provides a container with PyTorch %(version)s. It also
contains DeepSpeed 0.12.3.
The module defines a number of environment variables:
* SIF and SIFPYTORCH: The full path and name of the Singularity SIF file
to use with singularity exec etc.
* SINGULARITY_BIND: Mounts the necessary directories from the system,
including /users, /project, /scratch and /flash so that you should be
able to use your regular directories in the container.
* RUNSCRIPTS and RUNSCRIPTSPYTORCH: The directory with some sample
runscripts.
Note that this container uses a Conda environment internally. When in
the container, the command to activate that Conda environment is contained
in the environment variable WITH_CONDA.
"""
docurls = ["DeepSpeed web site: https://www.deepspeed.ai/"]

toolchain = SYSTEM

# "Extracting" a source is just a copy into the current directory; cp -L
# follows a symbolic link so that the real image file is copied.
sources = [
    {"filename": local_sif, "extract_cmd": "/bin/cp -L %s ."},
#   {"filename": local_docker, "extract_cmd": "/bin/cp -L %s ."},
]

# There is nothing to compile for a prebuilt image.
skipsteps = ["build"]

# The image goes directly into the top of the installation directory.
files_to_copy = [
    ([local_sif], "."),
#   ([local_docker], "share/docker-defs/"),
]
# Body of the runscripts/conda-python-simple wrapper: it activates the Conda
# environment through $WITH_CONDA and then runs python with the arguments
# given to the wrapper.
#
# The text is written to disk in postinstallcmds through an unquoted shell
# here-document, so every '$' is protected with a backslash to keep it from
# being expanded at install time. A raw string is used so that Python passes
# those backslashes through verbatim without treating '\$' as an (invalid)
# escape sequence.
local_runscript_python_simple = r"""
#!/bin/bash -e
# Start conda environment inside the container
\$WITH_CONDA
# Run application
python "\$@"
"""
# Body of the runscripts/conda-python-distributed wrapper. Per task it:
#   * runs rocm-smi and (re)creates the MIOpen cache directory on local rank 0,
#   * prints the CPU affinity of the task,
#   * activates the Conda environment through $WITH_CONDA,
#   * exports the NCCL/RCCL interface settings and the MASTER_ADDR, MASTER_PORT,
#     WORLD_SIZE, RANK and ROCR_VISIBLE_DEVICES variables derived from Slurm,
#   * runs python with the arguments given to the wrapper.
#
# The text is written to disk in postinstallcmds through an unquoted shell
# here-document, so every '$' is protected with a backslash to keep it from
# being expanded at install time. A raw string is used so that Python passes
# those backslashes through verbatim without treating '\$' as an (invalid)
# escape sequence.
local_runscript_python_distributed = r"""
#!/bin/bash -e
# Make sure GPUs are up
if [ \$SLURM_LOCALID -eq 0 ] ; then
rocm-smi
fi
sleep 2
export MIOPEN_USER_DB_PATH="/tmp/\$(whoami)-miopen-cache-\$SLURM_NODEID"
export MIOPEN_CUSTOM_CACHE_DIR=\$MIOPEN_USER_DB_PATH
# Set MIOpen cache to a temporary folder.
if [ \$SLURM_LOCALID -eq 0 ] ; then
rm -rf \$MIOPEN_USER_DB_PATH
mkdir -p \$MIOPEN_USER_DB_PATH
fi
sleep 2
# Report affinity
echo "Rank \$SLURM_PROCID --> \$(taskset -p \$\$)"
# Start conda environment inside the container
\$WITH_CONDA
# Set interfaces to be used by RCCL.
export NCCL_SOCKET_IFNAME=hsn0,hsn1,hsn2,hsn3
export NCCL_NET_GDR_LEVEL=3
# Set environment for the app
export MASTER_ADDR=\$(/runscripts/get-master "\$SLURM_NODELIST")
export MASTER_PORT=29500
export WORLD_SIZE=\$SLURM_NPROCS
export RANK=\$SLURM_PROCID
export ROCR_VISIBLE_DEVICES=\$SLURM_LOCALID
# Run application
python "\$@"
"""
# Body of the runscripts/get-master helper: a small Python program that prints
# the first host name of the Slurm node list passed as its only argument
# (e.g. "nid[005012-005014,005020]" -> "nid005012").
#
# The embedded program must keep valid Python indentation, otherwise the
# installed script fails with an IndentationError. It is written to disk via
# an unquoted shell here-document, so it must not contain '$' or backquotes.
local_runscript_get_master = """
#!/usr/bin/python3
# This is the correct Python path both on LUMI and in the container, but the script
# should really be used in the container.
import argparse

def get_parser():
    parser = argparse.ArgumentParser(description="Extract master node name from Slurm node list",
                                     formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument("nodelist", help="Slurm nodelist")
    return parser

if __name__ == '__main__':
    parser = get_parser()
    args = parser.parse_args()
    first_nodelist = args.nodelist.split(',')[0]
    if '[' in first_nodelist:
        a = first_nodelist.split('[')
        first_node = a[0] + a[1].split('-')[0]
    else:
        first_node = first_nodelist
    print(first_node)
"""
# (file name, file content) of every helper script placed in the runscripts
# directory of the installation.
local_runscripts = (
    ('conda-python-simple', local_runscript_python_simple),
    ('conda-python-distributed', local_runscript_python_distributed),
    ('get-master', local_runscript_get_master),
)

# Create the directory first, then write each script through a shell
# here-document and make it executable for everybody.
postinstallcmds = ['mkdir -p %(installdir)s/runscripts']
for local_script_name, local_script_body in local_runscripts:
    postinstallcmds.append(f'cat >%(installdir)s/runscripts/{local_script_name} <<EOF {local_script_body}EOF')
    postinstallcmds.append(f'chmod a+x %(installdir)s/runscripts/{local_script_name}')
# Only the runscripts directory is checked. The SIF image (local_sif) is left
# out on purpose: the user is allowed to remove that file, and regenerating
# the module afterwards would otherwise fail in the sanity check.
#'files': [f'share/docker-defs/{local_docker}'],
sanity_check_paths = dict(
    files=[],
    dirs=['runscripts'],
)
# Paths bound into the container, in the order in which they end up in
# SINGULARITY_BIND. The runscripts directory of the installation is made
# available inside the container as /runscripts.
local_bind_mounts = [
    '/var/spool/slurmd',
    '/opt/cray',
    '/usr/lib64/libcxi.so.1',
    '/usr/lib64/libjansson.so.4',
    '%(installdir)s/runscripts:/runscripts',
    '/pfs',
    '/scratch',
    '/projappl',
    '/project',
    '/flash',
    '/appl',
]

# Extra environment variables defined by the module.
# SIF and SIFPYTORCH are not listed here: they are currently set by a
# function called from modluafooter.
#'SIF': '%(installdir)s/' + local_sif,
#'SIFPYTORCH': '%(installdir)s/' + local_sif,
modextravars = {
    'RUNSCRIPTS': '%(installdir)s/runscripts',
    'RUNSCRIPTSPYTORCH': '%(installdir)s/runscripts',
    'SINGULARITY_BIND': ','.join(local_bind_mounts),
    'SINGULARITYENV_PREPEND_PATH': '/runscripts',
}
# Lua code appended to the module file: a single call to
# create_container_vars with the image file name, the package name and the
# installation directory.
modluafooter = (
    "\n"
    "-- Call a routine to set the various environment variables.\n"
    "create_container_vars( '" + local_sif + "', 'PyTorch', '%(installdir)s' )\n"
)

moduleclass = "devel"