
PyTorch/2.2.0-rocm-5.6.1-python-3.10-singularity-20240208 (PyTorch-2.2.0-rocm-5.6.1-python-3.10-singularity-20240208.eb)

Install with the EasyBuild-user module in partition/container:

module load LUMI partition/container EasyBuild-user
eb PyTorch-2.2.0-rocm-5.6.1-python-3.10-singularity-20240208.eb
The module will be available in all versions of the LUMI stack and in the CrayEnv stack.

To access module help after installation, use module spider PyTorch/2.2.0-rocm-5.6.1-python-3.10-singularity-20240208.
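
After installation, load the module and use the environment variables it defines to run the container. A minimal sketch of a first check (the account is a placeholder and the resource options are only an example; adapt them to your project):

module load LUMI PyTorch/2.2.0-rocm-5.6.1-python-3.10-singularity-20240208
srun --partition=small-g --gpus=1 --time=00:05:00 --account=project_XXXXXXXXX \
    singularity exec $SIF /runscripts/conda-python-simple \
    -c 'import torch; print(torch.__version__)'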

EasyConfig:

# Developed by Kurt Lust and Mihkel Tiks for LUMI
#DOC Contains PyTorch 2.2.0 with torchaudio 2.2.0, torchdata 0.7.1+cpu, torchtext 0.17.0+cpu,
#DOC torchvision 0.17.0 GPU version and DeepSpeed 0.12.3, on Python 3.10 and ROCm 5.6.1.

easyblock = 'MakeCp'

name =          'PyTorch'
version =       '2.2.0'
versionsuffix = '-rocm-5.6.1-python-3.10-singularity-20240208'

local_sif =    'lumi-pytorch-rocm-5.6.1-python-3.10-pytorch-v2.2.0-dockerhash-f421797bd407.sif'
#local_docker = 'lumi-pytorch-rocm-5.6.1-python-3.10-pytorch-v2.1.0.docker'

homepage = 'https://pytorch.org/'

whatis = [
    'Description: PyTorch, a machine learning package',
    'Keywords: PyTorch, DeepSpeed, flash-attention'
]

description = """
This module provides a container with PyTorch %(version)s. It also
contains DeepSpeed 0.12.3 and flash-attention 2.0.4.

The module defines a number of environment variables:
*   SIF and SIFPYTORCH: The full path and name of the Singularity SIF file 
    to use with singularity exec etc.
*   SINGULARITY_BIND: Mounts the necessary directories from the system,
    including /users, /project, /scratch and /flash, so that your regular
    directories are available in the container.
*   RUNSCRIPTS and RUNSCRIPTSPYTORCH: The directory with some sample
    runscripts.

Note that this container uses a Conda environment internally. When in
the container, the command to activate that environment is contained in
the environment variable WITH_CONDA.
"""

docurls = [
    'DeepSpeed web site: https://www.deepspeed.ai/'    
]

toolchain = SYSTEM

sources = [
    {
        'filename':    local_sif,
        'extract_cmd': '/bin/cp -L %s .'
    },
#    {
#        'filename':    local_docker,
#        'extract_cmd': '/bin/cp -L %s .'
#    },
]

skipsteps = ['build']

files_to_copy = [
    ([local_sif],    '.'),
#    ([local_docker], 'share/docker-defs/')    
]
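
# Note: in the runscript strings below, '$' is escaped as '\$' so that the
# unquoted here-documents in postinstallcmds expand nothing at install time
# and the variables survive literally into the generated scripts.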

local_runscript_python_simple="""
#!/bin/bash -e

# Start conda environment inside the container
\$WITH_CONDA

# Run application
python "\$@"

"""

local_runscript_python_distributed="""
#!/bin/bash -e

# Make sure GPUs are up
if [ \$SLURM_LOCALID -eq 0 ] ; then
    rocm-smi
fi
sleep 2

export MIOPEN_USER_DB_PATH="/tmp/\$(whoami)-miopen-cache-\$SLURM_NODEID"
export MIOPEN_CUSTOM_CACHE_DIR=\$MIOPEN_USER_DB_PATH

# Set MIOpen cache to a temporary folder.
if [ \$SLURM_LOCALID -eq 0 ] ; then
    rm -rf \$MIOPEN_USER_DB_PATH
    mkdir -p \$MIOPEN_USER_DB_PATH
fi
sleep 2

# Report affinity
echo "Rank \$SLURM_PROCID --> \$(taskset -p \$\$)"

# Start conda environment inside the container
\$WITH_CONDA

# Set interfaces to be used by RCCL.
export NCCL_SOCKET_IFNAME=hsn0,hsn1,hsn2,hsn3
export NCCL_NET_GDR_LEVEL=3

# Set environment for the app
export MASTER_ADDR=\$(/runscripts/get-master "\$SLURM_NODELIST")
export MASTER_PORT=29500
export WORLD_SIZE=\$SLURM_NPROCS
export RANK=\$SLURM_PROCID
export ROCR_VISIBLE_DEVICES=\$SLURM_LOCALID

# Run application
python "\$@"

"""

local_runscript_get_master="""
#!/usr/bin/python3
# This is the correct Python path both on LUMI and in the container, but the script
# should really be used in the container.

import argparse
def get_parser():
    parser = argparse.ArgumentParser(description="Extract master node name from Slurm node list",
            formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument("nodelist", help="Slurm nodelist")
    return parser


if __name__ == '__main__':
    parser = get_parser()
    args = parser.parse_args()

    first_nodelist = args.nodelist.split(',')[0]

    if '[' in first_nodelist:
        a = first_nodelist.split('[')
        first_node = a[0] + a[1].split('-')[0]

    else:
        first_node = first_nodelist

    print(first_node)

"""

postinstallcmds = [
    'mkdir -p %(installdir)s/runscripts',
    f'cat >%(installdir)s/runscripts/conda-python-simple <<EOF {local_runscript_python_simple}EOF',
    'chmod a+x %(installdir)s/runscripts/conda-python-simple',
    f'cat >%(installdir)s/runscripts/conda-python-distributed <<EOF {local_runscript_python_distributed}EOF',
    'chmod a+x %(installdir)s/runscripts/conda-python-distributed',
    f'cat >%(installdir)s/runscripts/get-master <<EOF {local_runscript_get_master}EOF',
    'chmod a+x %(installdir)s/runscripts/get-master',
]

sanity_check_paths = {
    # We deliberately don't check for local_sif: the user is allowed to remove
    # that file but may still want to regenerate the module, which would then
    # fail the sanity check.
    #'files': [f'share/docker-defs/{local_docker}'],
    'files': [],
    'dirs':  ['runscripts'],
}

modextravars = {
    # SIF variables currently set by a function via modluafooter.
    #'SIF':                        '%(installdir)s/' + local_sif,
    #'SIFPYTORCH':                 '%(installdir)s/' + local_sif,
    'RUNSCRIPTS':                  '%(installdir)s/runscripts',
    'RUNSCRIPTSPYTORCH':           '%(installdir)s/runscripts',
    'SINGULARITY_BIND':            '/var/spool/slurmd,/opt/cray,/usr/lib64/libcxi.so.1,/usr/lib64/libjansson.so.4,' +
                                   '%(installdir)s/runscripts:/runscripts,' + 
                                   '/pfs,/scratch,/projappl,/project,/flash,/appl',
    'SINGULARITYENV_PREPEND_PATH': '/runscripts',
}

modluafooter = f"""
-- Call a routine to set the various environment variables.
create_container_vars( '{local_sif}', 'PyTorch', '%(installdir)s' )
"""

moduleclass = 'devel'
