Commit 5daf09a7 authored by anzhengqi's avatar anzhengqi

modify some network readme and scripts

parent 8868e294
......@@ -84,8 +84,8 @@ bash run_train_standalone.sh [TRAINING_DATASET_PATH] [DEVICE] [CHECKPOINT_SAVE_P
# example: bash run_train_standalone.sh /home/ma-user/work/training_dataset/ Ascend ./ckpt/ 0
# or train OctSqueeze in parallel (8P)
bash run_train_distribute.sh [TRAINING_DATASET_PATH] [CHECKPOINT_SAVE_PATH] [RANK_TABLE_FILE]
# example: bash run_train_distribute.sh /home/ma-user/work/training_dataset/ ./ckpt/ /path/hccl_8p.json
# evaluate OctSqueeze
bash run_eval.sh [TEST_DATASET_PATH] [COMPRESSED_DATA_PATH] [RECONSTRUCTED_DATA_PATH] [MODE] [DEVICE]
......@@ -190,7 +190,7 @@ Major parameters in eval.py:
# or enter script dir, run 1P training script
bash run_train_standalone.sh /home/ma-user/work/training_dataset/ Ascend ./ckpt/ 0
# or enter script dir, run 8P training script
bash run_train_distribute.sh /home/ma-user/work/training_dataset/ ./ckpt/ /path/hccl_8p.json
```
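The new `[RANK_TABLE_FILE]` argument points at an HCCL rank table describing the 8 Ascend devices. A minimal sketch of what such a single-server 8P file typically looks like (all server IDs and device IPs below are placeholders, not values from this repository; the mindspore/models repo provides an `hccl_tools.py` helper to generate the real file):

```json
{
    "version": "1.0",
    "server_count": "1",
    "server_list": [
        {
            "server_id": "10.0.0.1",
            "device": [
                {"device_id": "0", "device_ip": "192.1.27.6", "rank_id": "0"},
                {"device_id": "1", "device_ip": "192.2.27.6", "rank_id": "1"},
                {"device_id": "2", "device_ip": "192.3.27.6", "rank_id": "2"},
                {"device_id": "3", "device_ip": "192.4.27.6", "rank_id": "3"},
                {"device_id": "4", "device_ip": "192.1.27.7", "rank_id": "4"},
                {"device_id": "5", "device_ip": "192.2.27.7", "rank_id": "5"},
                {"device_id": "6", "device_ip": "192.3.27.7", "rank_id": "6"},
                {"device_id": "7", "device_ip": "192.4.27.7", "rank_id": "7"}
            ],
            "host_nic_ip": "reserve"
        }
    ],
    "status": "completed"
}
```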
After training, the loss value will be as follows:
......
......@@ -87,8 +87,8 @@ bash run_train_standalone.sh [TRAINING_DATASET_PATH] [DEVICE] [CHECKPOINT_SAVE_P
# example: bash run_train_standalone.sh /home/ma-user/work/training_dataset/ Ascend ./ckpt/ 0
# or train OctSqueeze in parallel (8P)
bash run_train_distribute.sh [TRAINING_DATASET_PATH] [CHECKPOINT_SAVE_PATH] [RANK_TABLE_FILE]
# example: bash run_train_distribute.sh /home/ma-user/work/training_dataset/ ./ckpt/ /path/hccl_8p.json
# evaluate OctSqueeze
bash run_eval.sh [TEST_DATASET_PATH] [COMPRESSED_DATA_PATH] [RECONSTRUCTED_DATA_PATH] [MODE] [DEVICE]
......@@ -194,7 +194,7 @@ Major parameters in eval.py:
# or enter script dir, run 1P training script
bash run_train_standalone.sh /home/ma-user/work/training_dataset/ Ascend ./ckpt/ 0
# or enter script dir, run 8P training script
bash run_train_distribute.sh /home/ma-user/work/training_dataset/ ./ckpt/ /path/hccl_8p.json
```
After training, the loss value will be as follows:
......
......@@ -14,9 +14,9 @@
# limitations under the License.
# ============================================================================
if [ $# != 3 ]
then
echo "Usage: bash run_train_distribute.sh [TRAINING_DATASET_PATH] [CHECKPOINT_SAVE_PATH] [RANK_TABLE_FILE]"
exit 1
fi
......@@ -38,6 +38,7 @@ fi
data_path=$1
checkpoint=$2
export RANK_TABLE_FILE=$3
export RANK_SIZE=8
current_exec_path=$(pwd)
......
......@@ -212,6 +212,9 @@ class BertTrainWithLossScaleCell(nn.TrainOneStepWithLossScaleCell):
    """
    def __init__(self, network, optimizer, scale_update_cell=None):
        super(BertTrainWithLossScaleCell, self).__init__(network, optimizer, scale_update_cell)
        self.degree = 1
        if self.reducer_flag:
            self.degree = get_group_size()
        self.cast = P.Cast()

    def construct(self,
......@@ -430,7 +433,11 @@ class BertEvaluationWithLossScaleCell(nn.TrainOneStepWithLossScaleCell):
    """
    def __init__(self, network, optimizer, scale_update_cell=None):
        super(BertEvaluationWithLossScaleCell, self).__init__(network, optimizer, scale_update_cell)
        self.degree = 1
        if self.reducer_flag:
            self.degree = get_group_size()
        self.cast = P.Cast()

    def construct(self,
                  input_ids,
                  input_mask,
......
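The `degree` initialization added to both cells above follows the usual data-parallel convention: an AllReduce sums gradients across devices, so dividing by the group size turns the sum into a mean. A framework-free sketch of that pattern (plain Python; `get_group_size` is stubbed out here as an assumption, standing in for `mindspore.communication.get_group_size`):

```python
def get_group_size():
    # Stub for the communication helper; in a real 8P job this returns
    # the number of devices in the communication group.
    return 8

class TrainCellSketch:
    def __init__(self, reducer_flag):
        # Mirrors the pattern added in the diff: degree stays 1 for
        # single-device runs, otherwise it is the group size.
        self.degree = 1
        if reducer_flag:
            self.degree = get_group_size()

    def average_gradients(self, summed_grads):
        # AllReduce has already summed gradients across devices;
        # dividing by degree yields the per-device mean.
        return [g / self.degree for g in summed_grads]

cell = TrainCellSketch(reducer_flag=True)
print(cell.average_gradients([8.0, 16.0]))  # -> [1.0, 2.0]
```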
......@@ -78,7 +78,7 @@ For FP16 operators, if the input data type is FP32, the backend of MindSpore wil
# [Environment Requirements](#contents)
- Hardware(Ascend/GPU/CPU)
- Prepare hardware environment with an Ascend, GPU, or CPU processor.
- Framework
- [MindSpore](https://www.mindspore.cn/install/en)
......@@ -94,9 +94,6 @@ After installing MindSpore via the official website, you can start training and
# run data process
bash scripts/run_download_dataset.sh
# run training example on Ascend
bash scripts/run_train_ascend.sh
......@@ -106,9 +103,6 @@ bash scripts/run_train_gpu.sh
# run training distribute example on Ascend
bash scripts/run_distribute_train.sh /path/hccl.json /path/MovieLens
# run evaluation example on Ascend
bash scripts/run_eval_ascend.sh
......@@ -170,10 +164,8 @@ If you want to run in modelarts, please check the official documentation of [mod
│ ├──ascend_distributed_launcher
│ ├──__init__.py // init file
│ ├──get_distribute_pretrain_cmd.py // create distribute shell script
│ ├──run_train_ascend.sh // shell script for train on Ascend
│ ├──run_distribute_train.sh // shell script for distribute train
│ ├──run_eval_ascend.sh // shell script for evaluation on Ascend
│ ├──run_train_gpu.sh // shell script for train on GPU
│ ├──run_eval_gpu.sh // shell script for evaluation on GPU
......@@ -233,12 +225,12 @@ Parameters for both training and evaluation can be set in config.py.
```bash
bash scripts/run_train_gpu.sh [DATASET_PATH] [CKPT_FILE] [DEVICE_ID]
```
- on CPU
```bash
python train.py --data_path=./dataset --dataset=ml-1m --train_epochs=25 --batch_size=256 --output_path=./output/ --checkpoint_path=./checkpoint --device_target=CPU --device_id=0 --num_parallel_workers=2 > train.log 2>&1 &
```
The python command above runs in the background; you can view the results in the file `train.log`. After training, you will find checkpoint files under the script folder by default. The loss value will be as follows:
......@@ -284,7 +276,7 @@ Parameters for both training and evaluation can be set in config.py.
- evaluation on ml-1m dataset when running on CPU
```bash
python eval.py --data_path=./dataset --dataset=ml-1m --eval_batch_size=160000 --output_path=./output/ --eval_file_name=eval.log --checkpoint_file_path=./ckpt --device_target=CPU --device_id=0 > eval.log 2>&1 &
```
The accuracy of the test dataset will be as follows:
......
#!/bin/bash
# Copyright 2022 Huawei Technologies Co., Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ============================================================================
if [ $# != 2 ]
then
echo "Please run the script as: "
echo "bash scripts/run_eval.sh DATASET_PATH CKPT_FILE"
echo "for example: bash scripts/run_eval.sh /dataset_path /ncf.ckpt"
exit 1
fi
data_path=$1
ckpt_file=$2
python ./eval.py --data_path $data_path --dataset 'ml-1m' --eval_batch_size 160000 --output_path './output/' --eval_file_name 'eval.log' --checkpoint_file_path $ckpt_file > eval.log 2>&1 &
......@@ -24,4 +24,6 @@ fi
data_path=$1
ckpt_file=$2
export DEVICE_ID=$3
python ./eval.py --data_path $data_path --dataset 'ml-1m' --eval_batch_size 160000 --output_path './output/' --eval_file_name 'eval.log' --checkpoint_file_path $ckpt_file --device_target=Ascend --device_id $DEVICE_ID > log.txt 2>&1 &
python ./eval.py --data_path $data_path --dataset 'ml-1m' --eval_batch_size 160000 \
--output_path './output/' --eval_file_name 'eval.log' --checkpoint_file_path $ckpt_file \
--device_target=Ascend --device_id $DEVICE_ID > eval.log 2>&1 &
......@@ -25,4 +25,6 @@ fi
data_path=$1
ckpt_file=$2
export CUDA_VISIBLE_DEVICES=$3
python ./eval.py --data_path $data_path --dataset 'ml-1m' --eval_batch_size 160000 --output_path './output/' --eval_file_name 'eval.log' --checkpoint_file_path $ckpt_file --device_target=GPU --device_id=0 > log.txt 2>&1 &
python ./eval.py --data_path $data_path --dataset 'ml-1m' --eval_batch_size 160000 \
--output_path './output/' --eval_file_name 'eval.log' --checkpoint_file_path $ckpt_file \
--device_target=GPU --device_id=0 > eval.log 2>&1 &
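Note how the GPU script pairs `CUDA_VISIBLE_DEVICES=$3` with a fixed `--device_id=0`: restricting visibility renumbers the chosen physical GPU as logical device 0. A tiny generic-shell illustration (the physical index 3 is an arbitrary example, not from the repository):

```shell
# Once CUDA_VISIBLE_DEVICES exposes a single physical GPU, CUDA programs
# address it as logical device 0, which is why --device_id=0 is always correct here.
export CUDA_VISIBLE_DEVICES=3   # physical GPU 3 becomes logical device 0
echo "logical device id: 0 (physical: $CUDA_VISIBLE_DEVICES)"
```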
#!/bin/bash
# Copyright 2022 Huawei Technologies Co., Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ============================================================================
if [ $# != 2 ]
then
echo "Please run the script as: "
echo "bash scripts/run_train.sh DATASET_PATH CKPT_FILE"
echo "for example: bash scripts/run_train.sh /dataset_path /ncf.ckpt"
exit 1
fi
data_path=$1
ckpt_file=$2
python ./train.py --data_path $data_path --dataset 'ml-1m' --train_epochs 20 --batch_size 256 --output_path './output/' --checkpoint_path $ckpt_file > train.log 2>&1 &
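All of these launch scripts share the same background-execution pattern: the python command ends in `&`, so the script returns immediately while training continues. A small generic-shell sketch of how a caller can wait for such a backgrounded job and read its exit status (`true` stands in for the long-running python command; `job.log` is a placeholder name):

```shell
# Launch in the background the way the scripts do, capture the PID,
# then block until the job finishes and report its exit status.
true > job.log 2>&1 &
pid=$!
wait "$pid"
echo "exit status: $?"
rm -f job.log
```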
......@@ -23,4 +23,5 @@ fi
data_path=$1
ckpt_file=$2
python ./train.py --data_path $data_path --dataset 'ml-1m' --train_epochs 25 --batch_size 256 --output_path './output/' --checkpoint_path $ckpt_file --device_target=Ascend > train.log 2>&1 &
python ./train.py --data_path $data_path --dataset 'ml-1m' --train_epochs 25 --batch_size 256 \
--output_path './output/' --checkpoint_path $ckpt_file --device_target=Ascend > train.log 2>&1 &
......@@ -274,6 +274,7 @@ RetinaFace can use a ResNet50 or MobileNet0.25 backbone to extract image features for detection
- Running on Ascend (using ResNet50 as the backbone)
```bash
# change the nnpu parameter in src/config.py to 1
python train.py --backbone_name 'ResNet50' > train.log 2>&1 &
OR
bash ./scripts/run_standalone_train_ascend.sh
......