!2172 pointnet gpu version

Merge pull request !2172 from jialing/master

!2172 pointnet gpu version
Merge pull request !2172 from jialing/master
c1090753 · zhaoting · Gitee · e06d93b3 · 5e162594 · c1090753
Unverified Commit c1090753 authored 3 years ago by zhaoting Committed by Gitee 3 years ago
--- a/research/cv/pointnet/README.md
+++ b/research/cv/pointnet/README.md
@@ -86,10 +86,13 @@ bash scripts/run_standalone_eval.sh '/home/pointnet/shapenetcore_partanno_segmen
        │   │   ├── utils.cc
        │   ├── build.sh
        ├── scripts
-        │   ├── run_distribute_train.sh  # launch distributed training with ascend platform (8p)
-        │   ├── run_standalone_eval.sh   # launch evaluating with ascend platform
-        │   ├── run_infer_310.sh         # run 310 infer
-        │   └── run_standalone_train.sh  # launch standalone training with ascend platform (1p)
+        │   ├── run_distribute_ascend.sh        # launch distributed training with ascend platform (8p)
+        │   ├── run_distribute_gpu.sh           # launch distributed training with gpu platform (8p)
+        │   ├── run_standalone_eval_ascend.sh          # launch evaluating with ascend platform (1p)
+        │   ├── run_standalone_eval_gpu.sh      # launch evaluating with gpu platform (1p)
+        │   ├── run_infer_310.sh                # run 310 infer
+        │   ├── run_standalone_train_ascend.sh         # launch standalone training with ascend platform (1p)
+        │   └── run_standalone_train_gpu.sh     # launch standalone training with gpu platform (1p)
        ├── src
        │   ├── misc                     # dataset part
        │   ├── dataset.py               # data preprocessing
@@ -109,10 +112,10 @@ bash scripts/run_standalone_eval.sh '/home/pointnet/shapenetcore_partanno_segmen

 ```bash
 Major parameters in train.py are as follows:
+--device_id        # train on which device
 --batchSize        # Training batch size.
 --nepoch           # Total training epochs.
 --learning_rate    # Training learning rate.
--device_id        # train on which device
 --data_url         # The path to the train and evaluation datasets.
 --loss_per_epoch   # The times to print loss value per epoch.
 --train_url        # The path to save files generated during training.
@@ -127,15 +130,32 @@ Major parameters in train.py are as follows:
 - running on Ascend

 ```shell
-# Run stand-alone training
-bash scripts/run_standalone_train.sh [DATA_PATH] [SAVE_DIR] [PRETRAINDE_CKPT(optional)]
+# Run stand-alone training for Ascend
+bash scripts/run_standalone_train_ascend.sh [DATA_PATH] [CKPT_PATH] [DEVICE_ID]
 # example:
-bash scripts/run_standalone_train.sh modelnet40_normal_resampled save pointnet2.ckpt
+bash scripts/run_standalone_train_ascend.sh ../shapenetcore_partanno_segmentation_benchmark_v0 ./ckpts 1

-# Run distributed training
-bash scripts/run_distributed_train.sh [RANK_TABLE_FILE] [DATA_PATH] [SAVE_DIR] [PRETRAINDE_CKPT(optional)]
+
+
+# Run distributed training for Ascend
+bash scripts/run_distribution_ascend.sh [RANK_TABLE_FILE] [CKPTS_DIR] [DATA_PATH]
 # example:
-bash scripts/run_standalone_train.sh hccl_8p_01234567_127.0.0.1.json modelnet40_normal_resampled save pointnet2.ckpt
+bash scripts/run_distribution_ascend.sh hccl_8p_01234567_127.0.0.1.json ./ckpts ../shapenetcore_partanno_segmentation_benchmark_v0
+
+
+```
+
+- running on GPU
+
+```shell
+# Run stand-alone training for GPU
+bash scripts/run_standalone_train_gpu.sh [DATA_PATH] [CKPT_PATH] [DEVICE_ID]
+# example:
+bash scripts/run_standalone_train_gpu.sh ../shapenetcore_partanno_segmentation_benchmark_v0 ./ckpts 1
+# Run distributed training for GPU
+bash scripts/run_distribute_gpu.sh [DATA_PATH] [CKPT_PATH]
+# example:
+bash scripts/run_distribute_gpu.sh ../shapenetcore_partanno_segmentation_benchmark_v0 ./ckpts
 ```

 Distributed training requires the creation of an HCCL configuration file in JSON format in advance. For specific
@@ -146,15 +166,15 @@ After training, the loss value will be achieved as follows:

 ```bash
 # train log
-Epoch : 1/25  episode : 1/40   Loss : 1.3433  Accuracy : 0.489538 step_time: 1.4269
-Epoch : 1/25  episode : 2/40   Loss : 1.2932  Accuracy : 0.541544 step_time: 1.4238
-Epoch : 1/25  episode : 3/40   Loss : 1.2558  Accuracy : 0.567900 step_time: 1.4397
-Epoch : 1/25  episode : 4/40   Loss : 1.1843  Accuracy : 0.654681 step_time: 1.4235
-Epoch : 1/25  episode : 5/40   Loss : 1.1262  Accuracy : 0.726756 step_time: 1.4206
-Epoch : 1/25  episode : 6/40   Loss : 1.1000  Accuracy : 0.736225 step_time: 1.4363
-Epoch : 1/25  episode : 7/40   Loss : 1.0487  Accuracy : 0.814338 step_time: 1.4457
-Epoch : 1/25  episode : 8/40   Loss : 1.0271  Accuracy : 0.782350 step_time: 1.4183
-Epoch : 1/25  episode : 9/40   Loss : 0.9777  Accuracy : 0.831025 step_time: 1.4289
+Epoch : 1/50  episode : 1/40   Loss : 1.3433  Accuracy : 0.489538 step_time: 1.4269
+Epoch : 1/50  episode : 2/40   Loss : 1.2932  Accuracy : 0.541544 step_time: 1.4238
+Epoch : 1/50  episode : 3/40   Loss : 1.2558  Accuracy : 0.567900 step_time: 1.4397
+Epoch : 1/50  episode : 4/40   Loss : 1.1843  Accuracy : 0.654681 step_time: 1.4235
+Epoch : 1/50  episode : 5/40   Loss : 1.1262  Accuracy : 0.726756 step_time: 1.4206
+Epoch : 1/50  episode : 6/40   Loss : 1.1000  Accuracy : 0.736225 step_time: 1.4363
+Epoch : 1/50  episode : 7/40   Loss : 1.0487  Accuracy : 0.814338 step_time: 1.4457
+Epoch : 1/50  episode : 8/40   Loss : 1.0271  Accuracy : 0.782350 step_time: 1.4183
+Epoch : 1/50  episode : 9/40   Loss : 0.9777  Accuracy : 0.831025 step_time: 1.4289

 ...
 ```
@@ -170,16 +190,32 @@ Before running the command below, please check the checkpoint path used for eval
 - running on Ascend

 ```shell
-# Evaluate
-bash scripts/run_eval.sh [DATA_PATH] [CKPT_NAME]
+# Evaluate on Ascend
+bash scripts/run_standalone_eval_ascend.sh [DATA_PATH] [MODEL_PATH] [DEVICE_ID]
+# example:
+bash scripts/run_standalone_eval_ascend.sh shapenetcore_partanno_segmentation_benchmark_v0 pointnet.ckpt 0
+```
+
+You can view the results through the file "log_standalone_eval_ascend". The accuracy of the test dataset will be as follows:
+
+```bash
+# grep "mIOU " log_standalone_eval_ascend
+'mIOU for class Chair: 0.869'
+```
+
+- running on GPU
+
+```shell
+# Evaluate on GPU
+bash scripts/run_standalone_eval_gpu.sh [DATA_PATH] [MODEL_PATH] [DEVICE_ID]
 # example:
-bash scripts/run_eval.sh shapenetcore_partanno_segmentation_benchmark_v0 pointnet.ckpt
+bash scripts/run_standalone_eval_gpu.sh shapenetcore_partanno_segmentation_benchmark_v0 pointnet.ckpt 0
 ```

-You can view the results through the file "eval.log". The accuracy of the test dataset will be as follows:
+You can view the results through the file "log_standalone_eval_gpu". The accuracy of the test dataset will be as follows:

 ```bash
-# grep "mIOU " eval.log
+# grep "mIOU " log_standalone_eval_gpu
 'mIOU for class Chair: 0.869'
 ```

@@ -219,35 +255,35 @@ Here, DVPP should be 'N'!

 ## Training Performance

-| Parameters                 | Ascend                                                      |
-| -------------------------- | ----------------------------------------------------------- |
-| Model Version              | PointNet                                                  |
-| Resource                   | Ascend 910; CPU 24cores; Memory 256G; OS Euler2.8           |
-| uploaded Date              | 11/30/2021 (month/day/year)                                 |
-| MindSpore Version          | 1.3.0                                                       |
-| Dataset                    | A subset of ShapeNet                                                  |
-| Training Parameters        | epoch=25, steps=83, batch_size=64, lr=0.005             |
-| Optimizer                  | Adam                                                        |
-| Loss Function              | NLLLoss                                                     |
-| outputs                    | probability                                                 |
-| Loss                       | 0.01                                                        |
-| Speed                      | 1.5 s/step (1p)                                             |
-| Total time                 | 0.3 h (1p)                                                 |
-| Checkpoint for Fine tuning | 17 MB (.ckpt file)                                          |
+| Parameters                 | Ascend                                            | GPU(V100(PCIE))                             |
+| -------------------------- |---------------------------------------------------|---------------------------------------------|
+| Model Version              | PointNet                                          | PointNet                                    |
+| Resource                   | Ascend 910; CPU 24cores; Memory 256G; OS Euler2.8 | NVIDIA RTX Titan-24G                        |
+| uploaded Date              | 11/30/2021 (month/day/year)                       | 4/19/2022 (month/day/year)                  |
+| MindSpore Version          | 1.3.0                                             | 1.3.0 1.5.0 1.6.0                           |
+| Dataset                    | A subset of ShapeNet                              | A subset of ShapeNet                        |
+| Training Parameters        | epoch=50, steps=83, batch_size=64, lr=0.005       | epoch=50, steps=83, batch_size=64, lr=0.005 |
+| Optimizer                  | Adam                                              | Adam                                        |
+| Loss Function              | NLLLoss                                           | NLLLoss                                     |
+| outputs                    | probability                                       | probability                                 |
+| Loss                       | 0.01                                              | 0.01                                        |
+| Speed                      | 1.5 s/step (1p)                                   | 0.19 s/step (1p)                            |
+| Total time                 | 0.3 h (1p)                                        | 10 min (1p)                                 |
+| Checkpoint for Fine tuning | 17 MB (.ckpt file)                                | 17 MB (.ckpt file)                          |

 ## Inference Performance

-| Parameters          | Ascend                      |
-| ------------------- | --------------------------- |
-| Model Version       | PointNet                  |
-| Resource            | Ascend 910; CPU 24cores; Memory 256G; OS Euler2.8 |
-| Uploaded Date       | 11/30/2021 (month/day/year) |
-| MindSpore Version   | 1.3.0                       |
-| Dataset             | A subset of ShapeNet                 |
-| Batch_size          | 64                          |
-| Outputs             | probability                 |
-| mIOU                | 86.3% (1p)                  |
-| Total time          | 1 min                     |
+| Parameters          | Ascend                                            | GPU(V100(PCIE))            |
+| ------------------- |---------------------------------------------------|---------------------------|
+| Model Version       | PointNet                                          | PointNet                  |
+| Resource            | Ascend 910; CPU 24cores; Memory 256G; OS Euler2.8 | NVIDIA RTX Titan-24G      |
+| Uploaded Date       | 11/30/2021 (month/day/year)                       | 4/19/2022 (month/day/year) |
+| MindSpore Version   | 1.3.0                                             | 1.3.0 1.5.0 1.6.0         |
+| Dataset             | A subset of ShapeNet                              | A subset of ShapeNet      |
+| Batch_size          | 64                                                | 64                        |
+| Outputs             | probability                                       | probability               |
+| mIOU                | 86.3% (1p)                                        | 86.3% (1p)                |
+| Total time          | 1 min                                             | 1 min                     |

 # [Description of Random Situation](#contents)


--- a/research/cv/pointnet/ascend310_infer/src/main.cc
+++ b/research/cv/pointnet/ascend310_infer/src/main.cc
 /**
- * Copyright 2021 Huawei Technologies Co., Ltd
+ * Copyright 2022 Huawei Technologies Co., Ltd
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.

--- a/research/cv/pointnet/ascend310_infer/src/utils.cc
+++ b/research/cv/pointnet/ascend310_infer/src/utils.cc
 /**
- * Copyright 2021 Huawei Technologies Co., Ltd
+ * Copyright 2022 Huawei Technologies Co., Ltd
 * 
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.

--- a/research/cv/pointnet/eval.py
+++ b/research/cv/pointnet/eval.py
-# Copyright 2021 Huawei Technologies Co., Ltd
+# Copyright 2022 Huawei Technologies Co., Ltd
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -23,7 +23,6 @@ import mindspore
 from mindspore import load_checkpoint, load_param_into_net, context
 import mindspore.dataset as ds
 import mindspore.ops as ops
-from mindspore.communication.management import init, get_rank
 from src.dataset import ShapeNetDataset
 from src.network import PointNetDenseCls
 from tqdm import tqdm
@@ -105,30 +104,14 @@ if __name__ == "__main__":
        context.set_context(save_graphs=False)
        if device_target == "Ascend":
            context.set_context(device_id=device_id)
-            if device_num > 1:
-                cfg.episode = int(cfg.episode / 2)
-                cfg.learning_rate = cfg.learning_rate * 2
-                context.reset_auto_parallel_context()
-                context.set_auto_parallel_context(device_num=device_num,
-                                                  parallel_mode=context.ParallelMode.DATA_PARALLEL, gradients_mean=True)
-                init()
-                local_data_url = os.path.join(local_data_url, str(device_id))
-                local_train_url = os.path.join(local_train_url, "_" + str(get_rank()))
        else:
            raise ValueError("Unsupported platform.")
        import moxing as mox

        mox.file.copy_parallel(src_url=args.data_url, dst_url=local_data_url)
    else:
-        context.set_context(mode=context.GRAPH_MODE, device_target=args.device_target, device_id=args.device_id)
+        context.set_context(mode=context.GRAPH_MODE, device_target=args.device_target)
        context.set_context(save_graphs=False)
-        if device_num > 1:
-            cfg.episode = int(cfg.episode / 2)
-            cfg.learning_rate = cfg.learning_rate * 2
-            context.reset_auto_parallel_context()
-            context.set_auto_parallel_context(device_num=device_num, parallel_mode=ParallelMode.DATA_PARALLEL,
-                                              gradients_mean=True)
-            init()

    if not os.path.exists(local_train_url):
        os.makedirs(local_train_url)
@@ -138,7 +121,6 @@ if __name__ == "__main__":
    random.seed(args.manualSeed)
    mindspore.set_seed(args.manualSeed)
    dataset_sink_mode = False
-    context.set_context(mode=context.GRAPH_MODE, device_target="Ascend", device_id=args.device_id)

    dataset_generator = ShapeNetDataset(
        root=local_data_url,

--- a/research/cv/pointnet/postprocess.py
+++ b/research/cv/pointnet/postprocess.py
-# Copyright 2021 Huawei Technologies Co., Ltd
+# Copyright 2022 Huawei Technologies Co., Ltd
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.

--- a/research/cv/pointnet/preprocess.py
+++ b/research/cv/pointnet/preprocess.py
-# Copyright 2021 Huawei Technologies Co., Ltd
+# Copyright 2022 Huawei Technologies Co., Ltd
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -37,7 +37,8 @@ if __name__ == '__main__':
        root=args.dataset_path,
        classification=False,
        split='test',
-        class_choice=[args.class_choice])
+        class_choice=[args.class_choice],
+        data_augmentation=False)
    dataset = ds.GeneratorDataset(dataset_generator, column_names=["point", "label"])
    dataset = dataset.batch(args.batchSize)


--- a/research/cv/pointnet/scripts/run_distribution.sh
+++ b/research/cv/pointnet/scripts/run_distribution.sh
 #!/bin/bash
-# Copyright 2021 Huawei Technologies Co., Ltd
+# Copyright 2022 Huawei Technologies Co., Ltd
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -17,7 +17,7 @@

 if [ $# != 3 ]
 then
-    echo "Usage: sh run_distribution_ascend.sh [RANK_TABLE_FILE] [CKPTS_DIR] [DATA_PATH]"
+    echo "Usage: bash scripts/run_distribution_ascend.sh [RANK_TABLE_FILE] [CKPTS_DIR] [DATA_PATH]"
 exit 1
 fi

@@ -52,11 +52,11 @@ do
    export RANK_ID=$((rank_start + i))
    rm -rf ./train_parallel$i
    mkdir ./train_parallel$i
-    cp -r ../src ./train_parallel$i
-    cp ../train.py ./train_parallel$i
+    cp -r ./src ./train_parallel$i
+    cp ./train.py ./train_parallel$i
    echo "start training for rank $RANK_ID, device $DEVICE_ID"
    cd ./train_parallel$i ||exit
    env > env.log
-    python -u train.py --device_id=$i --train_url=$CKPTS_DIR --data_url=$DATA_PATH 
+    nohup python -u train.py --device_id=$i --train_url=$CKPTS_DIR --data_url=$DATA_PATH >log_distribution_ascend 2>&1 &
    cd ..
 done
--- a/research/cv/pointnet/scripts/run_distribution_gpu.sh
+++ b/research/cv/pointnet/scripts/run_distribution_gpu.sh
+#!/bin/bash
+# Copyright 2022 Huawei Technologies Co., Ltd
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ============================================================================
+
+if [ $# -ne 2 ]
+then 
+    echo "Usage: bash scripts/run_distribute_gpu.sh [DATA_PATH] [CKPT_PATH]"
+exit 1
+fi
+DATA_PATH=$1
+CKPT_PATH=$2
+
+
+export RANK_SIZE=8
+
+echo "======start training======"
+
+mpirun --allow-run-as-root -n $RANK_SIZE --output-filename log_output --merge-stderr-to-stdout \
+nohup python ./train.py \
+  --data_url=$DATA_PATH \
+  --device_target="GPU" \
+  --train_url=$CKPT_PATH \
+  --nepoch=50 > log_distribution_gpu 2>&1 &
+cd ..
\ No newline at end of file
--- a/research/cv/pointnet/scripts/run_infer_310.sh
+++ b/research/cv/pointnet/scripts/run_infer_310.sh
 #!/bin/bash
-# Copyright 2021 Huawei Technologies Co., Ltd
+# Copyright 2022 Huawei Technologies Co., Ltd
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.

--- a/research/cv/pointnet/scripts/run_standalone_eval_ascend.sh
+++ b/research/cv/pointnet/scripts/run_standalone_eval_ascend.sh
+#!/bin/bash
+# Copyright 2022 Huawei Technologies Co., Ltd
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ============================================================================
+
+if [ $# -ne 3 ]
+then 
+    echo "Usage: bash scripts/run_standalone_eval_ascend.sh [DATA_PATH] [MODEL_PATH] [DEVICE_ID]"
+exit 1
+fi
+DATA_PATH=$1
+MODEL_PATH=$2
+DEVICE_ID=$3
+get_real_path(){
+  if [ "${1:0:1}" == "/" ]; then
+    echo "$1"
+  else
+    echo "$(realpath -m $PWD/$1)"
+  fi
+}
+
+PATH1=$(get_real_path $1)
+echo $PATH1
+PATH2=$(get_real_path $2)
+echo $PATH2
+if [ $# == 3 ]; then
+    DEVICE_ID=$3
+fi
+
+
+export RANK_SIZE=1
+
+echo "======start training======"
+
+export DEVICE_ID=$DEVICE_ID
+nohup python ./eval.py \
+  --data_path=$DATA_PATH \
+  --device_target="Ascend" \
+  --model_path=$MODEL_PATH > log_standalone_eval_ascend 2>&1 &
--- a/research/cv/pointnet/scripts/run_standalone_eval_gpu.sh
+++ b/research/cv/pointnet/scripts/run_standalone_eval_gpu.sh
+#!/bin/bash
+# Copyright 2022 Huawei Technologies Co., Ltd
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ============================================================================
+
+if [ $# -ne 3 ]
+then 
+    echo "Usage: bash scripts/run_standalone_eval_gpu.sh [DATA_PATH] [MODEL_PATH] [DEVICE_ID]"
+exit 1
+fi
+DATA_PATH=$1
+MODEL_PATH=$2
+DEVICE_ID=$3
+get_real_path(){
+  if [ "${1:0:1}" == "/" ]; then
+    echo "$1"
+  else
+    echo "$(realpath -m $PWD/$1)"
+  fi
+}
+
+PATH1=$(get_real_path $1)
+echo $PATH1
+PATH2=$(get_real_path $2)
+echo $PATH2
+if [ $# == 3 ]; then
+    DEVICE_ID=$3
+fi
+
+
+export RANK_SIZE=1
+
+echo "======start training======"
+
+export CUDA_VISIBLE_DEVICES=$DEVICE_ID
+nohup python ./eval.py \
+  --data_path=$DATA_PATH \
+  --device_target="GPU" \
+  --model_path=$MODEL_PATH >log_standalone_eval_gpu 2>&1 &
--- a/research/cv/pointnet/scripts/run_standalone_train.sh
+++ b/research/cv/pointnet/scripts/run_standalone_train.sh
 #!/bin/bash
-# Copyright 2020 Huawei Technologies Co., Ltd
+# Copyright 2022 Huawei Technologies Co., Ltd
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -14,8 +14,21 @@
 # limitations under the License.
 # ============================================================================

-# an simple tutorial as follows, more parameters can be setting
-# script_self=$(readlink -f "$0")
+if [ $# -ne 3 ]
+then 
+    echo "Usage: bash scripts/run_standalone_train_ascend.sh [DATA_PATH] [CKPT_PATH] [DEVICE_ID]"
+exit 1
+fi
 DATA_PATH=$1
 CKPT_PATH=$2
-python -s ../train.py --data_url=$DATA_PATH --device_target="Ascend" --train_url=$CKPT_PATH > log 2>&1 &
+DEVICE_ID=$3
+
+export RANK_SIZE=1
+
+echo "======start training======"
+
+export DEVICE_ID=$DEVICE_ID
+nohup python ./train.py \
+  --data_url=$DATA_PATH \
+  --device_target="Ascend" \
+  --train_url=$CKPT_PATH >log_standalone_train_ascend 2>&1 &
--- a/research/cv/pointnet/scripts/run_standalone_eval.sh
+++ b/research/cv/pointnet/scripts/run_standalone_eval.sh
 #!/bin/bash
-# Copyright 2020 Huawei Technologies Co., Ltd
+# Copyright 2022 Huawei Technologies Co., Ltd
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -14,9 +14,21 @@
 # limitations under the License.
 # ============================================================================

-# an simple tutorial as follows, more parameters can be setting
-# script_self=$(readlink -f "$0")
-# self_path=$(dirname "${script_self}")
+if [ $# -ne 3 ]
+then 
+    echo "Usage: bash scripts/run_standalone_train_gpu.sh [DATA_PATH] [CKPT_PATH] [DEVICE_ID]"
+exit 1
+fi
 DATA_PATH=$1
-MODEL_PATH=$2
-python -s ../eval.py --data_path=$DATA_PATH --device_target="Ascend" --model_path=$MODEL_PATH > log_eval 2>&1 &
+CKPT_PATH=$2
+DEVICE_ID=$3
+
+export RANK_SIZE=1
+
+echo "======start training======"
+
+export CUDA_VISIBLE_DEVICES=$DEVICE_ID
+nohup python ./train.py \
+  --data_url=$DATA_PATH \
+  --device_target="GPU" \
+  --train_url=$CKPT_PATH > log_standalone_train_gpu 2>&1 &
--- a/research/cv/pointnet/src/dataset.py
+++ b/research/cv/pointnet/src/dataset.py
-# Copyright 2021 Huawei Technologies Co., Ltd
+# Copyright 2022 Huawei Technologies Co., Ltd
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.

--- a/research/cv/pointnet/src/export.py
+++ b/research/cv/pointnet/src/export.py
-# Copyright 2021 Huawei Technologies Co., Ltd
+# Copyright 2022 Huawei Technologies Co., Ltd
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.

--- a/research/cv/pointnet/src/loss.py
+++ b/research/cv/pointnet/src/loss.py
-# Copyright 2021 Huawei Technologies Co., Ltd
+# Copyright 2022 Huawei Technologies Co., Ltd
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.

--- a/research/cv/pointnet/src/network.py
+++ b/research/cv/pointnet/src/network.py
-# Copyright 2021 Huawei Technologies Co., Ltd
+# Copyright 2022 Huawei Technologies Co., Ltd
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.

--- a/research/cv/pointnet/src/preprocess.py
+++ b/research/cv/pointnet/src/preprocess.py
-# Copyright 2021 Huawei Technologies Co., Ltd
+# Copyright 2022 Huawei Technologies Co., Ltd
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -38,7 +38,8 @@ if __name__ == '__main__':
        root=args.dataset_path,
        classification=False,
        split='test',
-        class_choice=[args.class_choice])
+        class_choice=[args.class_choice],
+        data_augmentation=False)
    dataset = ds.GeneratorDataset(dataset_generator, column_names=["point", "label"])
    dataset = dataset.batch(args.batchSize)


--- a/research/cv/pointnet/train.py
+++ b/research/cv/pointnet/train.py
-# Copyright 2021 Huawei Technologies Co., Ltd
+# Copyright 2022 Huawei Technologies Co., Ltd
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -40,12 +40,12 @@ parser = argparse.ArgumentParser(description='MindSpore Pointnet Segmentation')
 parser.add_argument(
    '--batchSize', type=int, default=64, help='input batch size')
 parser.add_argument(
-    '--nepoch', type=int, default=25, help='number of epochs to train for')
+    '--nepoch', type=int, default=50, help='number of epochs to train for')
 parser.add_argument('--model', type=str, default='', help='model path')
 parser.add_argument('--device_id', type=int, default=5, help='device id')
 parser.add_argument('--learning_rate', type=float, default=0.0005, help='device id')
 parser.add_argument('--device_target', default='Ascend', help='device target')
-parser.add_argument('--data_url', type=str, default='/home/pointnet/shapenetcore_partanno_segmentation_benchmark_v0'
+parser.add_argument('--data_url', type=str, default='../shapenetcore_partanno_segmentation_benchmark_v0'
                    , help="dataset path")
 parser.add_argument('--train_url', type=str, default='./ckpts'
                    , help="ckpts path")
@@ -59,7 +59,7 @@ reshape = ops.Reshape()
 print(args)


-def train_model(_net_train, network, _dataset, _test_dataset, _num_classes):
+def train_model(_net_train, network, _dataset, _test_dataset, _num_classes, rank_id=0):
    """train_model"""
    print('loading data')
    print(time.strftime("%Y-%m-%d  %H:%M:%S", time.localtime()))
@@ -78,20 +78,10 @@ def train_model(_net_train, network, _dataset, _test_dataset, _num_classes):
            t_0 = time.time()
            points = data['data']
            label = data['label']
-            network.set_train(True)
-
-            pred = network(points)
-            pred = ops.Reshape()(pred, (-1, _num_classes))
-            pred_choice = ops.Argmax(axis=1, output_type=mindspore.int32)(pred)
-
-            pred_np = pred_choice.asnumpy()
-            target = ops.Reshape()(label, (-1, 1))[:, 0] - 1
-            target_np = target.asnumpy()
-            correct = np.equal(pred_np, target_np).sum()
            loss = _net_train(points, label)
-            print('Epoch : %d/%d  episode : %d/%d   Loss : %.4f  Accuracy : %f step_time: %.4f' %
+            print('Epoch : %d/%d  episode : %d/%d   Loss : %.4f  step_time: %.4f' %
                  (epoch, args.nepoch, batch_id, steps_per_epoch, np.mean(loss.asnumpy())
-                   , correct.item() / float(args.batchSize * 2500), (time.time() - t_0)))
+                   , (time.time() - t_0)))
            if batch_id % 9 == 0:
                data = valid_data
                points, label = data['point'], data['label']
@@ -110,7 +100,7 @@ def train_model(_net_train, network, _dataset, _test_dataset, _num_classes):
                print('[%d: %d/%d] %s  loss: %f accuracy: %.4f  best_accuracy: %f' %
                      (epoch, batch_id, steps_per_epoch, blue('test'), np.mean(loss.asnumpy())
                       , accuracy, best_accuracy))
-                if accuracy > best_accuracy or accuracy > 0.93:
+                if rank_id == 0 and accuracy > best_accuracy:
                    save_time += 1
                    if accuracy > best_accuracy:
                        best_accuracy = accuracy
@@ -127,8 +117,8 @@ if __name__ == "__main__":
    local_data_url = args.data_url
    local_train_url = args.train_url
    device_num = int(os.getenv("RANK_SIZE", "1"))
-    shard_id = None
-    num_shards = None
+    shard_id = 0
+    num_shards = device_num
    if args.enable_modelarts:
        device_id = int(os.getenv("DEVICE_ID"))
        import moxing as mox
@@ -157,8 +147,10 @@ if __name__ == "__main__":
        mox.file.copy_parallel(src_url=args.data_url, dst_url=local_data_url)
    else:
        # run on the local server
-        context.set_context(mode=context.GRAPH_MODE, device_target=args.device_target, device_id=args.device_id)
+        context.set_context(mode=context.GRAPH_MODE, device_target=args.device_target)
        context.set_context(save_graphs=False)
+        if args.device_target == "GPU":
+            context.set_context(enable_graph_kernel=True)
        if device_num > 1:

            args.learning_rate = args.learning_rate * 2
@@ -166,9 +158,10 @@ if __name__ == "__main__":
            context.set_auto_parallel_context(device_num=device_num, parallel_mode=ParallelMode.DATA_PARALLEL,
                                              gradients_mean=True)
            init()
+            shard_id = get_rank()

    if not os.path.exists(local_train_url):
-        os.makedirs(local_train_url)
+        os.makedirs(local_train_url, exist_ok=True)

    dataset_sink_mode = False

@@ -187,8 +180,8 @@ if __name__ == "__main__":
                                  , shuffle=True, num_shards=num_shards, shard_id=shard_id)
    dataset = dataset.batch(args.batchSize, drop_remainder=True)

-    test_dataset = ds.GeneratorDataset(test_dataset_generator, ["point", "label"], shuffle=True
-                                       , num_shards=num_shards, shard_id=shard_id)
+    test_dataset = ds.GeneratorDataset(test_dataset_generator, ["point", "label"], shuffle=False,
+                                       num_shards=1, shard_id=0)
    test_dataset = test_dataset.batch(args.batchSize, drop_remainder=True)

    num_classes = dataset_generator.num_seg_classes
@@ -207,4 +200,4 @@ if __name__ == "__main__":
    net_train = nn.TrainOneStepCell(net_with_loss, optim, sens=1024)

    train_model(_net_train=net_train, network=classifier, _dataset=dataset
-                , _test_dataset=test_dataset, _num_classes=num_classes)
+                , _test_dataset=test_dataset, _num_classes=num_classes, rank_id=shard_id)