From 5daf09a79740dd38394827d5b30d86ac7c10296b Mon Sep 17 00:00:00 2001
From: anzhengqi <anzhengqi1@huawei.com>
Date: Wed, 7 Sep 2022 10:31:20 +0800
Subject: [PATCH] modify some network readme and scripts

---
 official/cv/octsqueeze/README.md              |  6 ++---
 official/cv/octsqueeze/README_CN.md           |  6 ++---
 .../scripts/run_train_distribute.sh           |  5 ++--
 .../nlp/tinybert/src/tinybert_for_gd_td.py    |  7 +++++
 official/recommend/ncf/README.md              | 16 +++--------
 official/recommend/ncf/scripts/run_eval.sh    | 27 -------------------
 .../recommend/ncf/scripts/run_eval_ascend.sh  |  4 ++-
 .../recommend/ncf/scripts/run_eval_gpu.sh     |  4 ++-
 official/recommend/ncf/scripts/run_train.sh   | 27 -------------------
 .../recommend/ncf/scripts/run_train_ascend.sh |  3 ++-
 research/cv/retinaface/README_CN.md           |  1 +
 11 files changed, 29 insertions(+), 77 deletions(-)
 delete mode 100644 official/recommend/ncf/scripts/run_eval.sh
 delete mode 100644 official/recommend/ncf/scripts/run_train.sh

diff --git a/official/cv/octsqueeze/README.md b/official/cv/octsqueeze/README.md
index 4b977f3e4..5bece2444 100644
--- a/official/cv/octsqueeze/README.md
+++ b/official/cv/octsqueeze/README.md
@@ -84,8 +84,8 @@ bash run_train_standalone.sh [TRAINING_DATASET_PATH] [DEVICE] [CHECKPOINT_SAVE_P
 # example: bash run_train_standalone.sh /home/ma-user/work/training_dataset/ Ascend ./ckpt/ 0
 
 # or tain OctSqueeze parallelly (8P)
-bash run_train_distribute.sh [TRAINING_DATASET_PATH] [CHECKPOINT_SAVE_PATH]
-# example: bash run_train_distribute.sh /home/ma-user/work/training_dataset/ ./ckpt/
+bash run_train_distribute.sh [TRAINING_DATASET_PATH] [CHECKPOINT_SAVE_PATH] [RANK_TABLE_FILE]
+# example: bash run_train_distribute.sh /home/ma-user/work/training_dataset/ ./ckpt/ /path/hccl_8p.json
 
 # evaluate OctSqueeze
 bash run_eval.sh [TEST_DATASET_PATH] [COMPRESSED_DATA_PATH] [RECONSTRUCTED_DATA_PATH] [MODE] [DEVICE]
@@ -190,7 +190,7 @@ Major parameters in eval.py:
   # or enter script dir, run 1P training script
   bash bash run_train_standalone.sh /home/ma-user/work/training_dataset/ Ascend ./ckpt/ 0
   # or enter script dir, run 8P training script
-  bash run_train_distribute.sh /home/ma-user/work/training_dataset/ ./ckpt/
+  bash run_train_distribute.sh /home/ma-user/work/training_dataset/ ./ckpt/ /path/hccl_8p.json
   ```
 
   After training, the loss value will be achieved as follows:
diff --git a/official/cv/octsqueeze/README_CN.md b/official/cv/octsqueeze/README_CN.md
index 6aaf67434..bee44b10b 100644
--- a/official/cv/octsqueeze/README_CN.md
+++ b/official/cv/octsqueeze/README_CN.md
@@ -87,8 +87,8 @@ bash run_train_standalone.sh [TRAINING_DATASET_PATH] [DEVICE] [CHECKPOINT_SAVE_P
 # example: bash run_train_standalone.sh /home/ma-user/work/training_dataset/ Ascend ./ckpt/ 0
 
 # 鎴栧垎甯冨紡璁粌OctSqueeze (8P)
-bash run_train_distribute.sh [TRAINING_DATASET_PATH] [CHECKPOINT_SAVE_PATH]
-# example: bash run_train_distribute.sh /home/ma-user/work/training_dataset/ ./ckpt/
+bash run_train_distribute.sh [TRAINING_DATASET_PATH] [CHECKPOINT_SAVE_PATH] [RANK_TABLE_FILE]
+# example: bash run_train_distribute.sh /home/ma-user/work/training_dataset/ ./ckpt/ /path/hccl_8p.json
 
 # 评估OctSqueeze
 bash run_eval.sh [TEST_DATASET_PATH] [COMPRESSED_DATA_PATH] [RECONSTRUCTED_DATA_PATH] [MODE] [DEVICE]
@@ -194,7 +194,7 @@ eval.py中的主要参数如下：
   # 鎴栬繘鍏ヨ剼鏈洰褰曪紝鎵ц1P鑴氭湰
   bash bash run_train_standalone.sh /home/ma-user/work/training_dataset/ Ascend ./ckpt/ 0
   # 鎴栬繘鍏ヨ剼鏈洰褰曪紝鎵ц8P鑴氭湰
-  bash run_train_distribute.sh /home/ma-user/work/training_dataset/ ./ckpt/
+  bash run_train_distribute.sh /home/ma-user/work/training_dataset/ ./ckpt/ /path/hccl_8p.json
   ```
 
   缁忚繃璁粌鍚庯紝鎹熷け鍊煎涓嬶細
diff --git a/official/cv/octsqueeze/scripts/run_train_distribute.sh b/official/cv/octsqueeze/scripts/run_train_distribute.sh
index 036f6d683..764d206fb 100644
--- a/official/cv/octsqueeze/scripts/run_train_distribute.sh
+++ b/official/cv/octsqueeze/scripts/run_train_distribute.sh
@@ -14,9 +14,9 @@
 # limitations under the License.
 # ============================================================================
 
-if [ $# != 2 ]
+if [ $# != 3 ]
 then 
-    echo "Usage: bash run_train_distribute.sh [TRAINING_DATASET_PATH] [CHECKPOINT_SAVE_PATH]"
+    echo "Usage: bash run_train_distribute.sh [TRAINING_DATASET_PATH] [CHECKPOINT_SAVE_PATH] [RANK_TABLE_FILE]"
 exit 1
 fi
 
@@ -38,6 +38,7 @@ fi
 data_path=$1
 checkpoint=$2
 
+export RANK_TABLE_FILE=$3
 export RANK_SIZE=8
 
 current_exec_path=$(pwd)
diff --git a/official/nlp/tinybert/src/tinybert_for_gd_td.py b/official/nlp/tinybert/src/tinybert_for_gd_td.py
index b1a70e92b..076fd919f 100644
--- a/official/nlp/tinybert/src/tinybert_for_gd_td.py
+++ b/official/nlp/tinybert/src/tinybert_for_gd_td.py
@@ -212,6 +212,9 @@ class BertTrainWithLossScaleCell(nn.TrainOneStepWithLossScaleCell):
     """
     def __init__(self, network, optimizer, scale_update_cell=None):
         super(BertTrainWithLossScaleCell, self).__init__(network, optimizer, scale_update_cell)
+        self.degree = 1
+        if self.reducer_flag:
+            self.degree = get_group_size()
         self.cast = P.Cast()
 
     def construct(self,
@@ -430,7 +433,11 @@ class BertEvaluationWithLossScaleCell(nn.TrainOneStepWithLossScaleCell):
     """
     def __init__(self, network, optimizer, scale_update_cell=None):
         super(BertEvaluationWithLossScaleCell, self).__init__(network, optimizer, scale_update_cell)
+        self.degree = 1
+        if self.reducer_flag:
+            self.degree = get_group_size()
         self.cast = P.Cast()
+
     def construct(self,
                   input_ids,
                   input_mask,
diff --git a/official/recommend/ncf/README.md b/official/recommend/ncf/README.md
index 9e3c8665a..7f6ced0ba 100644
--- a/official/recommend/ncf/README.md
+++ b/official/recommend/ncf/README.md
@@ -78,7 +78,7 @@ For FP16 operators, if the input data type is FP32, the backend of MindSpore wil
 
 # [Environment Requirements](#contents)
 
-- Hardware(Ascend）
+- Hardware(Ascend/GPU/CPU）
     - Prepare hardware environment with Ascend.
 - Framework
     - [MindSpore](https://www.mindspore.cn/install/en)
@@ -94,9 +94,6 @@ After installing MindSpore via the official website, you can start training and
 #run data process
 bash scripts/run_download_dataset.sh
 
-# run training example
-bash scripts/run_train.sh
-
 # run training example on Ascend
 bash scripts/run_train_ascend.sh
 
@@ -106,9 +103,6 @@ bash scripts/run_train_gpu.sh
 # run training distribute example on Ascend
 bash scripts/run_distribute_train.sh /path/hccl.json /path/MovieLens
 
-# run evaluation example
-bash scripts/run_eval.sh
-
 # run evaluation example on Ascend
 bash scripts/run_eval_ascend.sh
 
@@ -170,10 +164,8 @@ If you want to run in modelarts, please check the official documentation of [mod
     │   ├──ascend_distributed_launcher
     │       ├──__init__.py                      // init file
     │       ├──get_distribute_pretrain_cmd.py   // create distribute shell script
-    │   ├──run_train.sh                    // shell script for train
     │   ├──run_train_ascend.sh             // shell script for train on Ascend
     │   ├──run_distribute_train.sh         // shell script for distribute train
-    │   ├──run_eval.sh                     // shell script for evaluation
     │   ├──run_eval_ascend.sh              // shell script for evaluation on Ascend
     │   ├──run_train_gpu.sh                // shell script for train on GPU
     │   ├──run_eval_gpu.sh                 // shell script for evaluation on GPU
@@ -233,12 +225,12 @@ Parameters for both training and evaluation can be set in config.py.
 
   ```bash
   bash scripts/run_train_gpu.sh [DATASET_PATH] [CKPT_FILE] [DEVICE_ID]
-  ```
+  ```
 
 - on CPU
 
   ```bash
-  python train.py --data_path=./dataset --dataset=ml-1m --train_epochs=25 --batch_size=256 --output_path=./output/ --checkpoint_path=./checkpoint --device_target=CPU --device_id=0 --num_parallel_workers=2 > train.log 2>&1 &
+  python train.py --data_path=./dataset --dataset=ml-1m --train_epochs=25 --batch_size=256 --output_path=./output/ --checkpoint_path=./checkpoint --device_target=CPU --device_id=0 --num_parallel_workers=2 > train.log 2>&1 &
   ```
 
   The python command above will run in the background, you can view the results through the file `train.log`. After training, you'll get some checkpoint files under the script folder by default. The loss value will be achieved as follows:
@@ -284,7 +276,7 @@ Parameters for both training and evaluation can be set in config.py.
 - evaluation on ml-1m dataset when running on CPU
 
   ```bash
-  python eval.py --data_path=./dataset --dataset=ml-1m --eval_batch_size=160000 --output_path=./output/ --eval_file_name=eval.log --checkpoint_file_path=./ckpt --device_target=CPU --device_id=0 > log.txt 2>&1 &
+  python eval.py --data_path=./dataset --dataset=ml-1m --eval_batch_size=160000 --output_path=./output/ --eval_file_name=eval.log --checkpoint_file_path=./ckpt --device_target=CPU --device_id=0 > eval.log 2>&1 &
   ```
 
   The accuracy of the test dataset will be as follows:
diff --git a/official/recommend/ncf/scripts/run_eval.sh b/official/recommend/ncf/scripts/run_eval.sh
deleted file mode 100644
index b7a054adb..000000000
--- a/official/recommend/ncf/scripts/run_eval.sh
+++ /dev/null
@@ -1,27 +0,0 @@
-#!/bin/bash
-# Copyright 2022 Huawei Technologies Co., Ltd
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ============================================================================
-if [ $# != 2 ]
-then
-    echo "Please run the script as: "
-    echo "bash scripts/run_eval.sh DATASET_PATH CKPT_FILE"
-    echo "for example: bash scripts/run_eval.sh /dataset_path /ncf.ckpt"
-exit 1
-fi
-
-
-data_path=$1
-ckpt_file=$2
-python ./eval.py --data_path $data_path --dataset 'ml-1m'  --eval_batch_size 160000 --output_path './output/' --eval_file_name 'eval.log' --checkpoint_file_path $ckpt_file > log.txt 2>&1 &
diff --git a/official/recommend/ncf/scripts/run_eval_ascend.sh b/official/recommend/ncf/scripts/run_eval_ascend.sh
index 7a1c95ec3..7fbb068fd 100644
--- a/official/recommend/ncf/scripts/run_eval_ascend.sh
+++ b/official/recommend/ncf/scripts/run_eval_ascend.sh
@@ -24,4 +24,6 @@ fi
 data_path=$1
 ckpt_file=$2
 export DEVICE_ID=$3
-python ./eval.py --data_path $data_path --dataset 'ml-1m'  --eval_batch_size 160000 --output_path './output/' --eval_file_name 'eval.log' --checkpoint_file_path $ckpt_file --device_target=Ascend --device_id $DEVICE_ID > log.txt 2>&1 &
+python ./eval.py --data_path $data_path --dataset 'ml-1m'  --eval_batch_size 160000 \
+    --output_path './output/' --eval_file_name 'eval.log' --checkpoint_file_path $ckpt_file \
+    --device_target=Ascend --device_id $DEVICE_ID > eval.log 2>&1 &
diff --git a/official/recommend/ncf/scripts/run_eval_gpu.sh b/official/recommend/ncf/scripts/run_eval_gpu.sh
index e253b6fa0..357b7677c 100644
--- a/official/recommend/ncf/scripts/run_eval_gpu.sh
+++ b/official/recommend/ncf/scripts/run_eval_gpu.sh
@@ -25,4 +25,6 @@ fi
 data_path=$1
 ckpt_file=$2
 export CUDA_VISIBLE_DEVICES=$3
-python ./eval.py --data_path $data_path --dataset 'ml-1m'  --eval_batch_size 160000 --output_path './output/' --eval_file_name 'eval.log' --checkpoint_file_path $ckpt_file --device_target=GPU --device_id=0 > log.txt 2>&1 &
+python ./eval.py --data_path $data_path --dataset 'ml-1m'  --eval_batch_size 160000 \
+    --output_path './output/' --eval_file_name 'eval.log' --checkpoint_file_path $ckpt_file \
+    --device_target=GPU --device_id=0 > eval.log 2>&1 &
diff --git a/official/recommend/ncf/scripts/run_train.sh b/official/recommend/ncf/scripts/run_train.sh
deleted file mode 100644
index f7f947a93..000000000
--- a/official/recommend/ncf/scripts/run_train.sh
+++ /dev/null
@@ -1,27 +0,0 @@
-#!/bin/bash
-# Copyright 2022 Huawei Technologies Co., Ltd
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ============================================================================
-
-if [ $# != 2 ]
-then
-    echo "Please run the script as: "
-    echo "bash scripts/run_train.sh DATASET_PATH CKPT_FILE"
-    echo "for example: bash scripts/run_train.sh /dataset_path /ncf.ckpt"
-exit 1
-fi
-
-data_path=$1
-ckpt_file=$2
-python ./train.py --data_path $data_path --dataset 'ml-1m'  --train_epochs 20 --batch_size 256 --output_path './output/' --checkpoint_path $ckpt_file > train.log 2>&1 &
diff --git a/official/recommend/ncf/scripts/run_train_ascend.sh b/official/recommend/ncf/scripts/run_train_ascend.sh
index 43626edb2..137f4ebff 100644
--- a/official/recommend/ncf/scripts/run_train_ascend.sh
+++ b/official/recommend/ncf/scripts/run_train_ascend.sh
@@ -23,4 +23,5 @@ fi
 
 data_path=$1
 ckpt_file=$2
-python ./train.py --data_path $data_path --dataset 'ml-1m'  --train_epochs 25 --batch_size 256 --output_path './output/' --checkpoint_path $ckpt_file --device_target=Ascend > train.log 2>&1 &
+python ./train.py --data_path $data_path --dataset 'ml-1m'  --train_epochs 25 --batch_size 256 \
+    --output_path './output/' --checkpoint_path $ckpt_file --device_target=Ascend > train.log 2>&1 &
diff --git a/research/cv/retinaface/README_CN.md b/research/cv/retinaface/README_CN.md
index 251836bbf..8043f40a1 100644
--- a/research/cv/retinaface/README_CN.md
+++ b/research/cv/retinaface/README_CN.md
@@ -274,6 +274,7 @@ RetinaFace可以使用ResNet50或MobileNet0.25骨干提取图像特征进行检
 - Ascend澶勭悊鍣ㄧ幆澧冭繍琛岋紙浣跨敤ResNet50浣滀负backbone锛�
 
   ```bash
+  # 将src/config.py文件中nnpu参数改为1
   python train.py --backbone_name 'ResNet50' > train.log 2>&1 &
   OR
   bash ./scripts/run_standalone_train_ascend.sh
-- 
GitLab