Advanced Computing Platform for Theoretical Physics

Commit 782f9665 authored by Lei Wang's avatar Lei Wang
Browse files

is it working?

parent eca2ddbc
#!/bin/bash -l
# Slurm batch script that starts one multihost_pmap.py process per allocated
# node. The "#SBATCH" lines are directives read by sbatch at submission time;
# lines beginning with "##SBATCH" are disabled and kept only for reference.
#
# Queue (partition) the job is submitted to.
#SBATCH --partition=v100
# Generic resources requested: 4 GPUs (per node, per the sbatch --gres docs).
#SBATCH --gres=gpu:4
# Disabled: explicit task counts. The script starts its own single-task job
# steps with srun further down instead.
##SBATCH --ntasks=2
##SBATCH --ntasks-per-node=1
# Number of nodes in the allocation; the launch loop starts one host per node.
#SBATCH --nodes=2
# Wall-clock limit of one hour.
#SBATCH --time=1:00:00
#SBATCH --job-name=test
# stdout and stderr go to separate files; %j is replaced by the job ID.
#SBATCH --output=output_%j.log
#SBATCH --error=error_%j.log
# --- Job diagnostics -------------------------------------------------------
# Record the basic Slurm parameters in the job log for later debugging.
echo "The current job ID is $SLURM_JOB_ID"
echo "Running on $SLURM_JOB_NUM_NODES nodes:"
# Quoted so that a bracketed node list such as "gpu[01-02]" is printed
# verbatim instead of being treated as a glob pattern.
echo "$SLURM_JOB_NODELIST"
echo "Using $SLURM_NTASKS_PER_NODE tasks per node"
echo "A total of $SLURM_NTASKS tasks is used"

# --- Head-node address -----------------------------------------------------
# Produces: num_hosts, nodes, nodes_array, node1, ip, port, ip_address.
num_hosts=$SLURM_JOB_NUM_NODES

# Expand the compact node list into one hostname per line / array element.
nodes=$(scontrol show hostnames "$SLURM_JOB_NODELIST")
mapfile -t nodes_array <<<"$nodes"
node1=${nodes_array[0]}
if [[ -z "$node1" ]]; then
  echo "Could not determine the allocated node names" >&2
  exit 1
fi

# Ask the first node for its own IP address; every host connects to it.
if ! addr_out=$(srun --nodes=1 --ntasks=1 -w "$node1" hostname --ip-address); then
  echo "Failed to query the IP address of $node1" >&2
  exit 1
fi

# "hostname --ip-address" may print several space-separated addresses.
# Use the first IPv4-looking one (no ':'), falling back to the first entry,
# so that ip_address is always a single host:port token.
read -r -a addr_list <<<"$addr_out"
ip=${addr_list[0]:-}
for addr in "${addr_list[@]}"; do
  if [[ "$addr" != *:* ]]; then
    ip=$addr
    break
  fi
done
if [[ -z "$ip" ]]; then
  echo "No IP address reported by $node1" >&2
  exit 1
fi

port='6379'
ip_address=$ip:$port
# Export the variable by NAME; "export $ip_address" would try to export an
# identifier called e.g. "10.0.0.1:6379" and fail.
export ip_address
# --- Launch one process per node -------------------------------------------
# Reads: num_hosts, nodes_array, ip_address (set earlier in this script).
echo "Job started at $(date)"

# Every host is started as its own background job step so that all of them
# run at the same time. Running the steps in the foreground would start host
# i+1 only after host i has exited, so the hosts could never reach each other
# through the shared --server_addr.
pids=()
for ((i = 0; i < num_hosts; i++)); do
  # NOTE: this is the value seen by the batch script itself; it is not
  # queried from the target host.
  echo "host $i CUDA devices $CUDA_VISIBLE_DEVICES"
  node=${nodes_array[$i]}
  srun --nodes=1 --ntasks=1 -w "$node" \
    python multihost_pmap.py \
    --server_addr="$ip_address" \
    --num_hosts="$num_hosts" \
    --host_idx="$i" &
  pids+=("$!")
done

# Barrier: wait for every host and remember the last failure, if any, so the
# job is not reported as successful when one of the hosts crashed.
status=0
for pid in "${pids[@]}"; do
  wait "$pid" || status=$?
done

echo "Job finished at $(date)"
exit "$status"
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment