1. Compiling NAMD on SGI clusters
Check resources
> module load cuda
> nvcc --version
CUDA 9.1 or above is required.
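If a GPU node is reachable (for example through an interactive job), the driver and devices can also be checked directly; this assumes nvidia-smi is on the path of the GPU node:
> nvidia-smi --query-gpu=name,driver_version --format=csv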
Follow the steps below to compile NAMD.
- Download the latest source code from the NAMD website
> tar -xzf NAMD_2.14_Source.tar.gz
> cd NAMD_2.14_Source
> tar -xf charm-6.10.2.tar
> cd charm-6.10.2
- Set up the compilation environment
Check which modules are required for running Intel MPI:
> module list
> module avail
> module purge
> module load compiler/intel/2020.2.254
> module load compiler/intelmpi/2020.2.254
or
> module swap mpt compiler/intelmpi
- Build the non-SMP and SMP versions of charm++
> ./build charm++ ofi-linux-x86_64 icc ifort -j16 --with-production
> ./build charm++ ofi-linux-x86_64 icc ifort smp -j16 --with-production
- Test the charm++ builds
Note that the previous binaries must be deleted before each new test.
> cd ofi-linux-x86_64-ifort-smp-icc/tests/charm++
> make clean && make && make test
> cd ../../..
> cd ofi-linux-x86_64-ifort-icc/tests/charm++
> make clean && make && make test
> cd ../../..
> cd ofi-linux-x86_64-ifort-smp-icc/tests/charm++
> make clean && make
> cd megatest && make && make test
> cd ../../../../..
- Download tcl and fftw
> wget http://www.ks.uiuc.edu/Research/namd/libraries/fftw-linux-x86_64.tar.gz
> wget http://www.ks.uiuc.edu/Research/namd/libraries/tcl8.5.9-linux-x86_64.tar.gz
> wget http://www.ks.uiuc.edu/Research/namd/libraries/tcl8.5.9-linux-x86_64-threaded.tar.gz
> tar xzf fftw-linux-x86_64.tar.gz
> mv linux-x86_64 fftw
> tar xzf tcl8.5.9-linux-x86_64.tar.gz
> mv tcl8.5.9-linux-x86_64 tcl
> tar xzf tcl8.5.9-linux-x86_64-threaded.tar.gz
> mv tcl8.5.9-linux-x86_64-threaded tcl-threaded
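As an optional sanity check, the unpacked libraries should now sit in the NAMD source directory, each with include and lib subdirectories; the paths below assume the commands above were run inside NAMD_2.14_Source:
> ls fftw/include fftw/lib
> ls tcl/include tcl/lib tcl-threaded/lib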
- Create the NAMD arch files
> cat << EOF > arch/Linux-x86_64-ofi-icc.arch
NAMD_ARCH = Linux-x86_64
CHARMARCH = ofi-linux-x86_64-ifort-icc
FLOATOPTS = -ip -xSKYLAKE-AVX512 -qopenmp-simd
CXX = icpc -std=c++11
CXXOPTS = -O2 \$(FLOATOPTS)
CXXNOALIASOPTS = -O2 -fno-alias \$(FLOATOPTS)
CXXCOLVAROPTS = -O2 -ip
CC = icc
COPTS = -O2 \$(FLOATOPTS)
EOF
> cat << EOF > arch/Linux-x86_64-ofi-smp-icc.arch
NAMD_ARCH = Linux-x86_64
CHARMARCH = ofi-linux-x86_64-ifort-smp-icc
FLOATOPTS = -ip -xSKYLAKE-AVX512 -qopenmp-simd
CXX = icpc -std=c++11
CXXOPTS = -O2 \$(FLOATOPTS)
CXXNOALIASOPTS = -O2 -fno-alias \$(FLOATOPTS)
CXXCOLVAROPTS = -O2 -ip
CC = icc
COPTS = -O2 \$(FLOATOPTS)
EOF
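The backslashes in the here-documents only keep the shell from expanding $(FLOATOPTS) while the files are written; a quick grep should confirm that the literal variable reference ended up in both arch files:
> grep FLOATOPTS arch/Linux-x86_64-ofi-icc.arch arch/Linux-x86_64-ofi-smp-icc.arch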
- Compile NAMD
> ./config Linux-x86_64-ofi-icc --charm-arch ofi-linux-x86_64-ifort-icc
> ./config Linux-x86_64-ofi-smp-icc --charm-arch ofi-linux-x86_64-ifort-smp-icc
> cd Linux-x86_64-ofi-icc
> make -j16
> cd ..
> cd Linux-x86_64-ofi-smp-icc
> make -j16
> cd ..
> module load cuda/9.1
> ./config Linux-x86_64-ofi-smp-icc.cuda --charm-arch ofi-linux-x86_64-ifort-smp-icc --with-cuda
> cd Linux-x86_64-ofi-smp-icc.cuda
> make -j16
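After the three builds finish, each build directory should contain a namd2 binary; a quick check from the current (CUDA) build directory:
> ls -l ../Linux-x86_64-ofi-icc/namd2 ../Linux-x86_64-ofi-smp-icc/namd2 ./namd2
Functional tests are best run through the PBS script in section 3; the 66-atom src/alanin config shipped with the NAMD source is a convenient minimal input.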
2. Compiling NAMD on Cray XC/XE/XK
Check resources
> module load cudatoolkit
> nvcc --version
CUDA 9.1 or above is required.
Follow the steps below to compile NAMD.
- Download the latest source code as on SGI
- Set up the compilation environment
> module swap PrgEnv-cray PrgEnv-intel (gnu for XE/XK)
> module load rca
> module load craype-hugepages8M
> module load fftw
- Build the non-SMP and SMP versions of charm++
for XC
> ./build charm++ gni-crayxc persistent -j16 --with-production
> ./build charm++ gni-crayxc persistent smp -j16 --with-production
for XE/XK
> ./build charm++ gemini_gni-crayxe persistent -j16 --with-production
> ./build charm++ gemini_gni-crayxe persistent smp -j16 --with-production
- Test the charm++ builds as on SGI
- Download tcl and fftw as on SGI
- Compile NAMD
> ./config CRAY-XC-intel --charm-arch gni-crayxc-persistent --with-fftw3
> ./config CRAY-XC-intel.smp --charm-arch gni-crayxc-persistent-smp --with-fftw3
> cd CRAY-XC-intel
> make -j16
> cd ..
> cd CRAY-XC-intel.smp
> make -j16
> cd ..
> module load cudatoolkit
> nvcc --version
> ./config CRAY-XC-intel.cuda --charm-arch gni-crayxc-persistent-smp --with-cuda --with-fftw3
> cd CRAY-XC-intel.cuda
> make -j16
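On Cray systems the binaries must be launched on compute nodes with aprun; from an interactive allocation (obtained with qsub -I, options are site-specific) a minimal test of the non-SMP build would be the following sketch, where src/alanin is the small 66-atom test system shipped with the NAMD source:
> cd <path to NAMD_2.14_Source>
> aprun -n 1 ./CRAY-XC-intel/namd2 src/alanin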
3. PBS script to run NAMD on SGI
When running CUDA NAMD, always add +idlepoll to the command line. This is needed to poll the GPU for results rather than sleeping while idle. Here is the PBS script for job submission.
#!/bin/csh
#PBS -l select=4:ncpus=48:mpiprocs=48:ngpus=1
set EXEC=/p/home/kuangz/NAMD_2.14_Source/Linux-x86_64-ofi-smp-icc.cuda
set SCR=$PBS_O_WORKDIR
cd $SCR
module unload mpt
module swap compiler/intel/2018.3.222 compiler/intel/2020.0.1
module load compiler/intelmpi/2020.1.0
module load cuda/9.1
mpirun -np 4 -f $PBS_NODEFILE $EXEC/namd2 +ppn 47 +commap 0 +pemap 1-47 +idlepoll +ignoresharing +isomalloc_sync npt.conf > npt.log
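Assuming the script above is saved as npt.pbs (any name works), it can be submitted and monitored with:
> qsub npt.pbs
> qstat -u $USER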
4. PBS script to run NAMD on Cray
When running CUDA NAMD, always add +idlepoll to the command line. This is needed to poll the GPU for results rather than sleeping while idle. Here is the PBS script for job submission.
#!/bin/csh
#PBS -l select=4:ncpus=22:mpiprocs=22:ngpus=1
set SCR=$PBS_O_WORKDIR
cd $SCR
module swap PrgEnv-cray PrgEnv-intel
module load rca
module swap craype-hugepages2M craype-hugepages8M
module load fftw
module load cudatoolkit
setenv HUGETLB_DEFAULT_PAGE_SIZE 8M
setenv HUGETLB_MORECORE no
aprun -n 4 -N 1 -d 22 $EXEC +idlepoll +ppn 21 npt16.conf > npt16.log
5. For OmniPath network
I have built the GPU version using
./build charm++ verbs-linux-x86_64 icc smp --with-qlogic --with-production
module load cuda
./config Linux-x86_64-icc.cuda --charm-arch verbs-linux-x86_64-smp-icc --with-cuda --cuda-prefix /p/app/cuda/9.1
It can be launched by
set SCR=$PBS_O_WORKDIR
set EXEC=/p/home/kuangz/NAMD_2.14_Source/Linux-x86_64-icc.cuda
cd $SCR
module load cuda
set NODES = `cat $PBS_NODEFILE`
set NODELIST = $SCR/namd2.nodelist
echo "group main" >! $NODELIST
foreach node ( $NODES )
echo "host $node" >> $NODELIST
end
@ npes = 2 * 2 * 23
$EXEC/charmrun ++p $npes ++ppn 23 ++nodelist $NODELIST $EXEC/namd2 +isomalloc_sync +setcpuaffinity +pemap 1-23,25-47 +commap 0,24 +idlepoll +ignoresharing stmv.namd > stmv.log1
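The line `@ npes = 2 * 2 * 23` hard-codes 2 nodes with 2 processes of 23 worker threads each; a small sketch that derives the node count from $PBS_NODEFILE instead (same assumption of 48-core nodes with cores 0 and 24 reserved for communication threads):
set NNODES = `sort -u $PBS_NODEFILE | wc -l`
@ npes = $NNODES * 2 * 23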
I have also built the MPI version.
- For a single node
setenv CC icc ; setenv CXX icpc ; setenv F90 ifort ; setenv F77 ifort
./build charm++ multicore-linux64 iccstatic --with-production "-O3 -ip -xCORE-AVX512 -qopt-zmm-usage=high"
./config Linux-x86_64-ofi-icc.single --charm-arch multicore-linux64-iccstatic --with-mkl
cd Linux-x86_64-ofi-icc.single
make -j4
namd2 +p48 +setcpuaffinity stmv.namd > test1.log
- For multiple nodes
module purge
module load compiler/intel/2019.3.199
module load compiler/intelmpi/2019.3.199
setenv CC icc ; setenv CXX icpc ; setenv F90 ifort ; setenv F77 ifort
setenv MPICXX mpiicpc; setenv MPI_CXX mpiicpc; setenv I_MPI_CC icc; setenv I_MPI_CXX icpc; setenv I_MPI_F90 ifort; setenv I_MPI_F77 ifort
./build charm++ mpi-linux-x86_64 smp mpicxx --with-production "-O3 -ip -xCORE-AVX512" -DCMK_OPTIMIZE -DMPICH_IGNORE_CXX_SEEK
./config Linux-x86_64-ofi-smp-icc.mpi --charm-arch mpi-linux-x86_64-smp-mpicxx --with-mkl
cd Linux-x86_64-ofi-smp-icc.mpi
make -j4
It can be launched by
set NODES=2
set NPPN=2
@ NMPI = $NODES * $NPPN
mpirun -ppn $NPPN -f $PBS_NODEFILE -np $NMPI $EXEC +ppn 23 +commap 0,24 +pemap 1-23,25-47 stmv.namd > stmv.mpi
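$EXEC is not defined in the fragment above; for the MPI build it should point at the namd2 binary itself, for example (the path shown is only a placeholder):
set EXEC = /path/to/NAMD_2.14_Source/Linux-x86_64-ofi-smp-icc.mpi/namd2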
6. For InfiniBand network
Either of the following charm++ builds works (the second also builds the Fortran interface and uses a parallel make):
./build charm++ verbs-linux-x86_64 icc smp --with-production
./build charm++ verbs-linux-x86_64 icc smp ifort -j8 --with-production
./config Linux-x86_64-icc --charm-arch verbs-linux-x86_64-smp-icc --with-cuda --cuda-prefix /usr/local/cuda
It can be launched by
set NODES = `cat $PBS_NODEFILE`
set NODELIST = $SCR/namd2.nodelist
echo "group main" >! $NODELIST
foreach node ( $NODES )
echo "host $node" >> $NODELIST
end
$EXEC/charmrun ++p 152 ++ppn 19 ++nodelist $NODELIST $EXEC/namd2 +setcpuaffinity +pemap 1-19,21-39 +commap 0,20 +idlepoll +ignoresharing npt21.conf > npt21.log