From 243f817c501ddb42fa813d39ed1a6e6cd80d46ab Mon Sep 17 00:00:00 2001
From: 185******25 <liuzhicheng15@huawei.com>
Date: Wed, 6 Jul 2022 11:57:16 +0000
Subject: [PATCH] update official/cv/resnet/README_CN.md.

update official/cv/resnet/README_CN.md.

add ascend scripts for scop

update official/cv/resnet/golden_stick/pruner/scop/train.py.

update official/cv/resnet/golden_stick/pruner/scop/train.py.

update official/cv/resnet/golden_stick/pruner/scop/train.py.

update official/cv/resnet/golden_stick/pruner/scop/train.py.

update official/cv/resnet/golden_stick/scripts/run_distribute_train.sh.

update official/cv/resnet/golden_stick/scripts/run_standalone_train.sh.

update official/cv/resnet/README_CN.md.

update official/cv/resnet/golden_stick/scripts/run_distribute_train.sh.

update official/cv/resnet/golden_stick/pruner/scop/resnet50_cifar10_config.yaml.

update official/cv/resnet/golden_stick/pruner/scop/train.py.

update official/cv/resnet/golden_stick/pruner/scop/train.py.

update official/cv/resnet/golden_stick/pruner/scop/resnet50_cifar10_config.yaml.

update official/cv/resnet/golden_stick/scripts/run_distribute_train.sh.

update official/cv/resnet/golden_stick/scripts/run_distribute_train.sh.

update official/cv/resnet/golden_stick/scripts/run_distribute_train.sh.

fix scop

fix

fix

update official/cv/resnet/golden_stick/pruner/scop/train.py.

update official/cv/resnet/golden_stick/pruner/scop/train.py.

update official/cv/resnet/golden_stick/pruner/scop/train.py.

update official/cv/resnet/golden_stick/pruner/scop/train.py.

fix CI

fix
---
 official/cv/resnet/README_CN.md               |  49 ++++++-
 .../pruner/scop/resnet50_cifar10_config.yaml  |  14 +-
 .../resnet/golden_stick/pruner/scop/train.py  |  40 ++++--
 .../scripts/run_distribute_train.sh           | 130 ++++++++++++++++++
 .../scripts/run_standalone_train.sh           |  98 +++++++++++++
 5 files changed, 311 insertions(+), 20 deletions(-)
 create mode 100644 official/cv/resnet/golden_stick/scripts/run_distribute_train.sh
 create mode 100644 official/cv/resnet/golden_stick/scripts/run_standalone_train.sh

diff --git a/official/cv/resnet/README_CN.md b/official/cv/resnet/README_CN.md
index d84ed6e00..538968dcd 100644
--- a/official/cv/resnet/README_CN.md
+++ b/official/cv/resnet/README_CN.md
@@ -775,12 +775,12 @@ Total data: 50000, top1 accuracy: 0.76844, top5 accuracy: 0.93522.
 
 ## 璁粌杩囩▼
 
-| **算法**  | SimQAT | SLB | SCOP |
+| **算法**  | SimQAT | SCOP | SLB |
 | --------- | ------ | --- | ---- |
 | **鏀寔鐨勫悗绔�**  | GPU | GPU銆丄scend | GPU |
 | **鏄惁鏀寔棰勮缁�** | 鏀寔鍔犺浇棰勮缁僣kpt | 蹇呴』鎻愪緵棰勮缁僣kpt | 绠楁硶鍘熺悊涓婃棤娉曞鐢ㄥ師ckpt锛屾棤娉曞姞杞介璁粌ckpt |
 | **鏄惁鏀寔缁缁�** | 鏀寔 | 鏀寔 | 鏀寔 |
-| **鏄惁鏀寔澶氬崱璁粌** | 鏀寔 | 涓嶆敮鎸� | 鏀寔 |
+| **鏄惁鏀寔澶氬崱璁粌** | 鏀寔 | 鏀寔 | 涓嶆敮鎸� |
 
 - 棰勮缁冩槸鎸囧厛涓嶅簲鐢ㄧ畻娉曪紝鍏堣缁冩敹鏁涗竴涓叏绮惧害鐨勭綉缁溿€傞璁粌鑾峰緱鐨刢heckpoint鏂囦欢琚敤浜庡悗缁簲鐢ㄧ畻娉曞悗鐨勮缁冦€�
 - 缁缁冩槸鎸囧簲鐢ㄧ畻娉曞悗璁粌缃戠粶锛屽湪璁粌杩囩▼涓腑鏂缁冿紝鍚庣画浠庝腑鏂鐨刢kpt缁х画杩涜璁粌銆�
@@ -830,6 +830,26 @@ cd ./golden_stick/scripts/
 bash run_distribute_train_gpu.sh ../pruner/scop/ ../pruner/scop/resnet50_cifar10_config.yaml /path/to/dataset
 ```
 
+### Ascend处理器环境运行
+
+```text
+# 分布式训练
+cd ./golden_stick/scripts/
+# PYTHON_PATH 表示需要应用的算法的'train.py'脚本所在的目录。
+bash run_distribute_train.sh [RANK_TABLE_FILE] [PYTHON_PATH] [CONFIG_PATH] [DATASET_PATH] [CKPT_TYPE](optional) [CKPT_PATH](optional)
+
+# 分布式训练示例(SCOP算法使用多卡训练)
+bash run_distribute_train.sh /path/to/rank_table_file ../pruner/scop/ ../pruner/scop/resnet50_cifar10_config.yaml /path/to/dataset
+
+# 单机训练
+cd ./golden_stick/scripts/
+# PYTHON_PATH 表示需要应用的算法的'train.py'脚本所在的目录。
+bash run_standalone_train.sh [PYTHON_PATH] [CONFIG_FILE] [DATASET_PATH] [CKPT_TYPE](optional) [CKPT_PATH](optional)
+
+# 单机训练示例(SCOP算法使用单卡训练)
+bash run_standalone_train.sh ../pruner/scop/ ../pruner/scop/resnet50_cifar10_config.yaml /path/to/dataset
+```
+
 ## 评估过程
 
 ### GPU处理器环境运行
@@ -850,10 +870,27 @@ bash run_eval_gpu.sh ../quantization/simqat/ ../quantization/simqat/resnet50_cif
 bash run_eval_gpu.sh ../quantization/slb/ ../quantization/slb/resnet18_cifar10_config.yaml /path/to/dataset /path/to/ckpt
 ```
 
+### Ascend处理器环境运行
+
+```text
+# 评估
+cd ./golden_stick/scripts/
+# PYTHON_PATH 表示需要应用的算法的'eval.py'脚本所在的目录。
+bash run_eval_ascend.sh [PYTHON_PATH] [CONFIG_FILE] [DATASET_PATH] [CHECKPOINT_PATH]
+```
+
+```text
+# 评估示例
+cd ./golden_stick/scripts/
+bash run_eval_ascend.sh ../pruner/scop/ ../pruner/scop/resnet50_cifar10_config.yaml /path/to/dataset /path/to/ckpt
+```
+
 ### 结果
 
 评估结果保存在示例路径中，文件夹名为“eval”。您可在此路径下的日志找到如下结果：
 
+#### GPU结果
+
 - 使用SimQAT算法量化ResNet50，并使用CIFAR-10数据集评估：
 
 ```text
@@ -884,6 +921,14 @@ result:{'top_1_accuracy': 0.9503205128205128, 'top_5_accuracy': 0.99669471153846
 result:{'top_1_accuracy': 0.9485176282051282, 'top_5_accuracy': 0.9965945512820513} ckpt=~/resnet18_cifar10/train_parallel/resnet-100_1562.ckpt
 ```
 
+#### Ascend结果
+
+- 使用SCOP算法剪枝ResNet50，并使用CIFAR-10数据集评估：
+
+```text
+result:{'top_1_accuracy': 0.928385416666666} prune_rate=0.45 ckpt=~/resnet50_cifar10/train_parallel0/resnet-400_195.ckpt
+```
+
 # 模型描述
 
 ## 性能
diff --git a/official/cv/resnet/golden_stick/pruner/scop/resnet50_cifar10_config.yaml b/official/cv/resnet/golden_stick/pruner/scop/resnet50_cifar10_config.yaml
index 8c50785bf..22ae3efe9 100644
--- a/official/cv/resnet/golden_stick/pruner/scop/resnet50_cifar10_config.yaml
+++ b/official/cv/resnet/golden_stick/pruner/scop/resnet50_cifar10_config.yaml
@@ -23,8 +23,8 @@ batch_size: 32
 loss_scale: 1024
 momentum: 0.9
 weight_decay: 0.0001
-epoch_size: 90
-epochs_ft: 400
+epoch_kf: 90
+epoch_ft: 400
 pretrain_epoch_size: 0
 save_checkpoint: True
 save_checkpoint_epochs: 5
@@ -32,12 +32,10 @@ keep_checkpoint_max: 10
 warmup_epochs: 5
 lr_decay_mode: "poly"
 lr_init: 0.01
-lr_end: 0.00001
-lr_max: 0.1
-lars_epsilon: 0.0
-lars_coefficient: 0.001
-lr_ft_end: 0.0001
-lr_ft_max: 0.4
+lr_end_kf: 0.0001
+lr_max_kf: 0.1
+lr_end_ft: 0.0001
+lr_max_ft: 0.04
 
 net_name: "resnet50"
 dataset: "cifar10"
diff --git a/official/cv/resnet/golden_stick/pruner/scop/train.py b/official/cv/resnet/golden_stick/pruner/scop/train.py
index 6299c079b..a2a64a9d8 100644
--- a/official/cv/resnet/golden_stick/pruner/scop/train.py
+++ b/official/cv/resnet/golden_stick/pruner/scop/train.py
@@ -239,9 +239,9 @@ def train_net():
 
     lr = get_lr(lr_init=config.lr_init,
                 lr_end=0.0,
-                lr_max=config.lr_max,
+                lr_max=config.lr_max_kf,
                 warmup_epochs=config.warmup_epochs,
-                total_epochs=config.epoch_size,
+                total_epochs=config.epoch_kf,
                 steps_per_epoch=step_size,
                 lr_decay_mode='cosine')
 
@@ -256,12 +256,28 @@ def train_net():
     net_with_loss = NetWithLossCell(model, kf_loss_fn, 1)
 
     net_train_step = nn.TrainOneStepCell(net_with_loss, optimizer)
-    train_kf(dataset, net_train_step, model, kfconv_list, kfscale_list)
+    if config.pre_trained:
+        for _, (_, module) in enumerate(model.cells_and_names()):
+            if isinstance(module, KfConv2d):
+                module.score = module.bn.gamma.data.abs() * ops.Squeeze()(
+                    module.kfscale.data - (1 - module.kfscale.data))
+                module.prune_rate = config.prune_rate
+        for _, (_, module) in enumerate(model.cells_and_names()):
+            if isinstance(module, KfConv2d):
+                _, index = ops.Sort()(module.score)
+                num_pruned_channel = int(module.prune_rate * module.score.shape[0])
+                module.out_index = index[num_pruned_channel:]
+        for param in model.get_parameters():
+            param.requires_grad = True
+        train_ft(model, dataset)
+    else:
+        model = train_kf(dataset, net_train_step, model, kfconv_list, kfscale_list)
+        train_ft(model, dataset)
 
 
 def train_kf(dataset, net_train_step, model, kfconv_list, kfscale_list):
     """train konckoff."""
-    for _ in range(0, config.epoch_size):
+    for _ in range(0, config.epoch_kf):
         from copy import deepcopy
         for _, (kf_data, kf_target) in enumerate(dataset.create_tuple_iterator()):
             kf = deepcopy(kf_data)
@@ -304,18 +320,21 @@ def train_kf(dataset, net_train_step, model, kfconv_list, kfscale_list):
             _, index = ops.Sort()(module.score)
             num_pruned_channel = int(module.prune_rate * module.score.shape[0])
             module.out_index = index[num_pruned_channel:]
-    train_ft(model, dataset)
+    return model
 
 
 def train_ft(model, dataset):
     """train finetune."""
     algo_ft = PrunerFtCompressAlgo({})
     model = algo_ft.apply(model)
+    if config.pre_trained:
+        pre_ckpt = ms.load_checkpoint(config.pre_trained)
+        ms.load_param_into_net(model, pre_ckpt)
     lr_ft_new = ms.Tensor(get_lr(lr_init=config.lr_init,
-                                 lr_end=config.lr_ft_end,
-                                 lr_max=config.lr_ft_max,
+                                 lr_end=config.lr_end_ft,
+                                 lr_max=config.lr_max_ft,
                                  warmup_epochs=config.warmup_epochs,
-                                 total_epochs=config.epochs_ft,
+                                 total_epochs=config.epoch_ft,
                                  steps_per_epoch=dataset.get_dataset_size(),
                                  lr_decay_mode='poly'))
 
@@ -337,13 +356,14 @@ def train_ft(model, dataset):
 
     time_cb = TimeMonitor(data_size=step_size)
     loss_cb = LossMonitor()
+    ckpt_save_dir = set_save_ckpt_dir()
     config_ck = CheckpointConfig(save_checkpoint_steps=5 * step_size,
                                  keep_checkpoint_max=10)
-    ckpt_cb = ModelCheckpoint(prefix="resnet", directory=config.output_path,
+    ckpt_cb = ModelCheckpoint(prefix="resnet", directory=ckpt_save_dir,
                               config=config_ck)
     ft_cb = [time_cb, loss_cb, ckpt_cb]
 
-    model_ft.train(config.epochs_ft, dataset, callbacks=ft_cb,
+    model_ft.train(config.epoch_ft, dataset, callbacks=ft_cb,
                    sink_size=dataset.get_dataset_size(), dataset_sink_mode=True)
 
     masked_conv_list = []
diff --git a/official/cv/resnet/golden_stick/scripts/run_distribute_train.sh b/official/cv/resnet/golden_stick/scripts/run_distribute_train.sh
new file mode 100644
index 000000000..5eef1db30
--- /dev/null
+++ b/official/cv/resnet/golden_stick/scripts/run_distribute_train.sh
@@ -0,0 +1,130 @@
+#!/bin/bash
+# Copyright 2022 Huawei Technologies Co., Ltd
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ============================================================================
+
+CURPATH="$(dirname "$0")"
+
+# Launch distributed training: one background train.py process per device,
+# each in its own ./train_parallel<i> working directory.
+# Arguments:
+#   $1 RANK_TABLE_FILE  rank table file, exported as RANK_TABLE_FILE
+#   $2 PYTHON_PATH      directory holding the train.py (and other *.py) to run
+#   $3 CONFIG_PATH      config file passed to train.py as --config_path
+#   $4 DATASET_PATH     dataset directory passed to train.py as --data_path
+#   $5 CKPT_TYPE        optional, FP32 or PRETRAINED; must come with $6
+#   $6 CKPT_PATH        optional, checkpoint file for the given CKPT_TYPE
+if [ $# != 4 ] && [ $# != 6 ]
+then
+  echo "Usage: bash run_distribute_train.sh [RANK_TABLE_FILE] [PYTHON_PATH] [CONFIG_PATH] [DATASET_PATH] [CKPT_TYPE](optional) [CKPT_PATH](optional)"
+  exit 1
+fi
+
+# Print $1 unchanged if it is absolute, otherwise resolve it against $PWD.
+get_real_path(){
+  if [ "${1:0:1}" == "/" ]; then
+    echo "$1"
+  else
+    echo "$(realpath -m "$PWD/$1")"
+  fi
+}
+
+PATH1=$(get_real_path "$1")
+PYTHON_PATH=$(get_real_path "$2")
+CONFIG_FILE=$(get_real_path "$3")
+PATH2=$(get_real_path "$4")
+
+if [ ! -d "$PYTHON_PATH" ]
+then
+    echo "error: PYTHON_PATH=$PYTHON_PATH is not a directory"
+    exit 1
+fi
+
+if [ ! -f "$PATH1" ]
+then
+    echo "error: RANK_TABLE_FILE=$PATH1 is not a file"
+    exit 1
+fi
+
+if [ ! -f "$CONFIG_FILE" ]
+then
+    echo "error: CONFIG_FILE=$CONFIG_FILE is not a file"
+    exit 1
+fi
+
+if [ ! -d "$PATH2" ]
+then
+    echo "error: DATASET_PATH=$PATH2 is not a directory"
+    exit 1
+fi
+
+# CKPT_TYPE/CKPT_PATH are arguments 5 and 6 ($4 is the dataset directory).
+CKPT_TYPE=""
+CKPT_FILE=""
+if [ $# == 6 ]; then
+  CKPT_TYPE=$5
+  CKPT_FILE=$(get_real_path "$6")
+
+  if [ "x$CKPT_TYPE" != "xFP32" ] && [ "x$CKPT_TYPE" != "xPRETRAINED" ]; then
+      echo "error: CKPT_TYPE=$CKPT_TYPE is not valid, should be FP32 or PRETRAINED"
+      exit 1
+  fi
+  if [ ! -f "$CKPT_FILE" ]; then
+      echo "error: CKPT_FILE=$CKPT_FILE is not a file"
+      exit 1
+  fi
+fi
+
+# Extra train.py option selecting how the checkpoint is loaded (none by default).
+ckpt_args=()
+if [ "x$CKPT_TYPE" == "xFP32" ]; then
+  ckpt_args=("--fp32_ckpt=$CKPT_FILE")
+elif [ "x$CKPT_TYPE" == "xPRETRAINED" ]; then
+  ckpt_args=("--pre_trained=$CKPT_FILE")
+fi
+
+ulimit -u unlimited
+export DEVICE_NUM=8
+export RANK_SIZE=8
+export RANK_TABLE_FILE=$PATH1
+
+export SERVER_ID=0
+rank_start=$((DEVICE_NUM * SERVER_ID))
+
+# Split the logical CPUs listed in /proc/cpuinfo evenly between the devices.
+cpus=$(grep -c "processor" /proc/cpuinfo)
+avg=$((cpus / DEVICE_NUM))
+gap=$((avg - 1))
+
+for((i=0; i<${DEVICE_NUM}; i++))
+do
+    # Pin this rank to its own contiguous range of CPU cores.
+    start=$((i * avg))
+    end=$((start + gap))
+    cmdopt=$start"-"$end
+    export DEVICE_ID=${i}
+    export RANK_ID=$((rank_start + i))
+    rm -rf "./train_parallel$i"
+    mkdir "./train_parallel$i"
+    cp "${PYTHON_PATH}"/*.py "./train_parallel$i"
+    cp "${CURPATH}"/*.sh "./train_parallel$i"
+    cp -r "${CURPATH}/../../src" "./train_parallel$i"
+    cd "./train_parallel$i" || exit
+    echo "start training for rank $RANK_ID, device $DEVICE_ID"
+    env > env.log
+    taskset -c "$cmdopt" python3 train.py --run_distribute=True --device_num=$RANK_SIZE \
+        --data_path="$PATH2" --config_path="$CONFIG_FILE" "${ckpt_args[@]}" \
+        --output_path './output' &> log &
+    cd ..
+done
diff --git a/official/cv/resnet/golden_stick/scripts/run_standalone_train.sh b/official/cv/resnet/golden_stick/scripts/run_standalone_train.sh
new file mode 100644
index 000000000..bae9f5de9
--- /dev/null
+++ b/official/cv/resnet/golden_stick/scripts/run_standalone_train.sh
@@ -0,0 +1,98 @@
+#!/bin/bash
+# Copyright 2022 Huawei Technologies Co., Ltd
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ============================================================================
+
+CURPATH="$(dirname "$0")"
+
+# Launch single-device training: copy the sources into ./train and start
+# train.py there in the background (its output is redirected to ./train/log).
+if [ $# != 3 ] && [ $# != 5 ]; then
+    echo "Usage: bash run_standalone_train.sh [PYTHON_PATH] [CONFIG_FILE] [DATASET_PATH] [CKPT_TYPE](optional) [CKPT_PATH](optional)"
+    echo "Examples:"
+    echo "  Train from the beginning:"
+    echo "    bash run_standalone_train.sh /path/to/train_py_dir resnet50_config.yaml /path/to/dataset"
+    echo "  Train from full precision checkpoint:"
+    echo "    bash run_standalone_train.sh /path/to/train_py_dir resnet50_config.yaml /path/to/dataset FP32 /path/to/fp32_ckpt"
+    echo "  Train from pretrained checkpoint:"
+    echo "    bash run_standalone_train.sh /path/to/train_py_dir resnet50_config.yaml /path/to/dataset PRETRAINED /path/to/pretrained_ckpt"
+    exit 1
+fi
+
+# Print $1 unchanged if it is absolute, otherwise resolve it against $PWD.
+get_real_path(){
+  if [ "${1:0:1}" == "/" ]; then
+    echo "$1"
+  else
+    echo "$(realpath -m "$PWD/$1")"
+  fi
+}
+
+PYTHON_PATH=$(get_real_path "$1")
+CONFIG_FILE=$(get_real_path "$2")
+DATASET_PATH=$(get_real_path "$3")
+
+if [ ! -d "$PYTHON_PATH" ]; then
+    echo "error: PYTHON_PATH=$PYTHON_PATH is not a directory"
+    exit 1
+fi
+
+if [ ! -f "$CONFIG_FILE" ]; then
+    echo "error: CONFIG_FILE=$CONFIG_FILE is not a file"
+    exit 1
+fi
+
+if [ ! -d "$DATASET_PATH" ]; then
+    echo "error: DATASET_PATH=$DATASET_PATH is not a directory"
+    exit 1
+fi
+
+# Start empty so a CKPT_TYPE inherited from the environment is never used.
+CKPT_TYPE=""
+CKPT_FILE=""
+if [ $# == 5 ]; then
+  CKPT_TYPE=$4
+  CKPT_FILE=$(get_real_path "$5")
+
+  if [ "x$CKPT_TYPE" != "xFP32" ] && [ "x$CKPT_TYPE" != "xPRETRAINED" ]; then
+      echo "error: CKPT_TYPE=$CKPT_TYPE is not valid, should be FP32 or PRETRAINED"
+      exit 1
+  fi
+  if [ ! -f "$CKPT_FILE" ]; then
+      echo "error: CKPT_FILE=$CKPT_FILE is not a file"
+      exit 1
+  fi
+fi
+
+# Extra train.py option selecting how the checkpoint is loaded (none by default).
+ckpt_args=()
+if [ "x$CKPT_TYPE" == "xFP32" ]; then
+  ckpt_args=("--fp32_ckpt=$CKPT_FILE")
+elif [ "x$CKPT_TYPE" == "xPRETRAINED" ]; then
+  ckpt_args=("--pre_trained=$CKPT_FILE")
+fi
+
+ulimit -u unlimited
+export DEVICE_NUM=1
+export RANK_ID=0
+export RANK_SIZE=1
+
+rm -rf ./train
+mkdir ./train
+cp "${PYTHON_PATH}"/*.py ./train
+cp -r "${CURPATH}/../../src" ./train
+cd ./train || exit
+
+python train.py --config_path="$CONFIG_FILE" --run_distribute=False \
+       --data_path="$DATASET_PATH" "${ckpt_args[@]}" --output_path './output' &> log &
-- 
GitLab