diff --git a/frame_benchmark/pytorch/dynamic/PaddleVideo/scripts/TSM/N1C1/TSM_bs30_fp32_DP.sh b/frame_benchmark/pytorch/dynamic/PaddleVideo/scripts/TSM/N1C1/TSM_bs30_fp32_DP.sh
index 0196644890..36625c0bb8 100644
--- a/frame_benchmark/pytorch/dynamic/PaddleVideo/scripts/TSM/N1C1/TSM_bs30_fp32_DP.sh
+++ b/frame_benchmark/pytorch/dynamic/PaddleVideo/scripts/TSM/N1C1/TSM_bs30_fp32_DP.sh
@@ -6,4 +6,5 @@ run_mode=DP
 device_num=N1C1
 
 bash prepare.sh;
-bash run_benchmark.sh ${model_item} ${bs_item} ${fp_item} ${run_process_type} ${run_mode} ${device_num} 2>&1;
+export CUDA_VISIBLE_DEVICES=0
+bash run_benchmark.sh ${model_item} ${bs_item} ${fp_item} ${run_process_type} ${run_mode} ${device_num} 2>&1;
\ No newline at end of file
diff --git a/frame_benchmark/pytorch/dynamic/PaddleVideo/scripts/TSM/N1C8/TSM_bs30_fp32_DP.sh b/frame_benchmark/pytorch/dynamic/PaddleVideo/scripts/TSM/N1C8/TSM_bs30_fp32_DP.sh
index 9e82e038c4..6d9fc064d5 100644
--- a/frame_benchmark/pytorch/dynamic/PaddleVideo/scripts/TSM/N1C8/TSM_bs30_fp32_DP.sh
+++ b/frame_benchmark/pytorch/dynamic/PaddleVideo/scripts/TSM/N1C8/TSM_bs30_fp32_DP.sh
@@ -6,4 +6,4 @@ run_mode=DP
 device_num=N1C8
 
 bash prepare.sh;
-bash run_benchmark.sh ${model_item} ${bs_item} ${fp_item} ${run_process_type} ${run_mode} ${device_num} 2>&1;
+bash run_benchmark.sh ${model_item} ${bs_item} ${fp_item} ${run_process_type} ${run_mode} ${device_num} 2>&1;
\ No newline at end of file
diff --git a/frame_benchmark/pytorch/dynamic/PaddleVideo/scripts/TSM/N4C32/TSM_bs30_fp32_DP.sh b/frame_benchmark/pytorch/dynamic/PaddleVideo/scripts/TSM/N4C32/TSM_bs30_fp32_DP.sh
new file mode 100644
index 0000000000..58e3a8b72f
--- /dev/null
+++ b/frame_benchmark/pytorch/dynamic/PaddleVideo/scripts/TSM/N4C32/TSM_bs30_fp32_DP.sh
@@ -0,0 +1,16 @@
+model_item=TSM
+bs_item=30
+fp_item=fp32
+run_process_type=MultiP
+run_mode=DP
+device_num=N4C32
+max_epoch=1
+num_workers=4
+
+node_num=${PADDLE_TRAINERS_NUM}
+node_rank=${PADDLE_TRAINER_ID}
+master_addr=${POD_0_IP}
+master_port=8877
+
+bash prepare.sh;
+bash run_benchmark.sh ${model_item} ${bs_item} ${fp_item} ${run_process_type} ${run_mode} ${device_num} ${max_epoch} ${num_workers} ${node_num} ${node_rank} ${master_addr} ${master_port} 2>&1;
\ No newline at end of file
diff --git a/frame_benchmark/pytorch/dynamic/PaddleVideo/scripts/TSM/benchmark_common/prepare.sh b/frame_benchmark/pytorch/dynamic/PaddleVideo/scripts/TSM/benchmark_common/prepare.sh
index 6a30074d3c..3f3999f8c2 100644
--- a/frame_benchmark/pytorch/dynamic/PaddleVideo/scripts/TSM/benchmark_common/prepare.sh
+++ b/frame_benchmark/pytorch/dynamic/PaddleVideo/scripts/TSM/benchmark_common/prepare.sh
@@ -4,14 +4,11 @@ echo "*******prepare benchmark start ***********"
 
 pip install -U pip
 echo `pip --version`
-python ${BENCHMARK_ROOT}/paddlecloud/file_upload_download.py \
-    --remote-path frame_benchmark/pytorch_req/pytorch_191/ \
-    --local-path ./ \
-    --mode download
-ls
+
 # pip install torch==1.9.1 -i https://pypi.tuna.tsinghua.edu.cn/simple
-pip install torch-1.9.1-cp37-cp37m-manylinux1_x86_64.whl -i https://pypi.tuna.tsinghua.edu.cn/simple
-pip install torchvision==0.10.1 -i https://pypi.tuna.tsinghua.edu.cn/simple
+pip install https://paddle-wheel.bj.bcebos.com/benchmark/torch-1.12.0%2Bcu113-cp37-cp37m-linux_x86_64.whl
+pip install https://paddle-wheel.bj.bcebos.com/benchmark/torchvision-0.13.0%2Bcu113-cp37-cp37m-linux_x86_64.whl
+
 pip install -r requirements.txt -i https://pypi.tuna.tsinghua.edu.cn/simple
 
 ################################# Prepare training data, e.g.:
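The new N4C32 script above takes its cluster topology from environment variables that the PaddleCloud scheduler normally injects (PADDLE_TRAINERS_NUM, PADDLE_TRAINER_ID, POD_0_IP). As a minimal sketch, assuming a manual dry run on the rank-0 node outside the scheduler, they could be exported by hand; the IP and rank values below are illustrative assumptions, not part of this patch:

    # Hypothetical manual dry run of the N4C32 script (values are examples only)
    export PADDLE_TRAINERS_NUM=4   # -> node_num: four nodes with 8 GPUs each = 32
    export PADDLE_TRAINER_ID=0     # -> node_rank: this node's rank, 0..3
    export POD_0_IP=10.0.0.1       # -> master_addr: IP of the rank-0 node (assumed)
    bash frame_benchmark/pytorch/dynamic/PaddleVideo/scripts/TSM/N4C32/TSM_bs30_fp32_DP.sh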
diff --git a/frame_benchmark/pytorch/dynamic/PaddleVideo/scripts/TSM/benchmark_common/run_benchmark.sh b/frame_benchmark/pytorch/dynamic/PaddleVideo/scripts/TSM/benchmark_common/run_benchmark.sh
index f7cf80a377..65ac0fde97 100644
--- a/frame_benchmark/pytorch/dynamic/PaddleVideo/scripts/TSM/benchmark_common/run_benchmark.sh
+++ b/frame_benchmark/pytorch/dynamic/PaddleVideo/scripts/TSM/benchmark_common/run_benchmark.sh
@@ -17,8 +17,15 @@ function _set_params(){
     convergence_key=""             # (optional) keyword marking the log lines that carry convergence data, e.g. convergence_key="loss:"
     max_epoch=${7:-"2"}            # (optional) keep the model run within 5 minutes; if the code must be changed to stop early, submit a PR to the suite, or adjust max_epoch
     num_workers=${8:-"3"}          # (optional)
-    # The following builds the generic log path; no modification needed in the usual case
-    model_name=${model_item}_bs${base_batch_size}_${fp_item}_${run_mode} # (required) do not change this format; it is aligned with the platform page display
+
+    # Added for distributed training
+    node_num=${9:-"2"}             # (optional) number of nodes
+    node_rank=${10:-"0"}           # (optional) rank of this node
+    master_addr=${11:-"127.0.0.1"} # (optional) master node IP address
+    master_port=${12:-"1928"}      # (optional) master node port
+
+    # The following builds the generic log path; no modification needed in the usual case
+    model_name=${model_item}_bs${base_batch_size}_${fp_item}_${run_mode} # (required) do not change this format; it is aligned with the platform page display
     device=${CUDA_VISIBLE_DEVICES//,/ }
     arr=(${device})
     num_gpu_devices=${#arr[*]}
@@ -54,7 +61,11 @@ function _train(){
     case ${run_process_type} in
     SingleP) train_cmd="python main.py ${train_options}" ;;
     MultiP)
-        train_cmd="python main.py ${train_options}" ;;
+        if [ ${device_num:3} = '8' ];then
+            train_cmd="python -m torch.distributed.launch --nproc_per_node=8 main.py --parallel ${train_options}"
+        elif [ ${device_num:3} = '32' ];then
+            train_cmd="python -m torch.distributed.launch --nnodes=${node_num} --node_rank=${node_rank} --master_addr=${master_addr} --master_port=${master_port} --nproc_per_node=8 main.py --parallel ${train_options}"
+        fi ;;
     *) echo "choose run_process_type(SingleP or MultiP)"; exit 1;
     esac
     # The following is the generic execution command; no modification needed in the usual case
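For reference, the ${device_num:3} tests in the MultiP branch rely on bash substring expansion: offset 3 drops the "NxC" prefix and leaves the GPU count, so N1C8 selects the single-node launch and N4C32 the multi-node one. A small standalone sketch of that expansion:

    # Bash ${var:offset} keeps everything from the given character index onward.
    device_num=N1C8;  echo "${device_num:3}"   # prints 8  -> single-node torch.distributed.launch
    device_num=N4C32; echo "${device_num:3}"   # prints 32 -> multi-node launch (--nnodes/--node_rank/--master_addr/--master_port)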