diff --git a/official/gnn/gat/README.md b/official/gnn/gat/README.md index 26ebb25b6e8c2d9f619a3d7f79d0c47fbc2ae440..bf7cb220a7604563428c8a89d487545c4cdeaedd 100644 --- a/official/gnn/gat/README.md +++ b/official/gnn/gat/README.md @@ -68,7 +68,7 @@ Note that you can run the scripts based on the dataset mentioned in original pap 鈹溾攢ind.cora.ty 鈹溾攢ind.cora.x 鈹斺攢ind.cora.y - ``` +``` - Generate dataset in mindrecord format for cora or citeseer. @@ -95,7 +95,7 @@ To ultilize the strong computation power of Ascend chip, and accelerate the trai ## [Environment Requirements](#contents) -- Hardware (Ascend) +- Hardware (Ascend/GPU) - Framework - [MindSpore](https://www.mindspore.cn/install/en) - For more information, please check the resources below: @@ -113,6 +113,13 @@ After installing MindSpore via the official website and Dataset is correctly gen bash run_train_ascend.sh [DATASET_NAME] ``` +- running on GPU + + ```bash + # run training example with cora dataset, DATASET_NAME is cora + bash run_train_gpu.sh [DATASET_NAME] [DATASET_DIR] + ``` + - Running on [ModelArts](https://support.huaweicloud.com/modelarts/) ```bash @@ -171,7 +178,8 @@ After installing MindSpore via the official website and Dataset is correctly gen 鈹溾攢README.md 鈹溾攢scripts | 鈹溾攢run_process_data_ascend.sh # Generate dataset in mindrecord format - | 鈹斺攢run_train_ascend.sh # Launch training + | 鈹溾攢run_train_gpu.sh # Launch GPU training + | 鈹斺攢run_train_ascend.sh # Launch Ascend training | 鈹溾攢src | 鈹溾攢dataset.py # Data preprocessing @@ -210,7 +218,7 @@ Parameters for both training and evaluation can be set in default_config.yaml. - running on Ascend - ```python + ```shell bash run_train_ascend.sh [DATASET_NAME] ``` @@ -233,6 +241,31 @@ Parameters for both training and evaluation can be set in default_config.yaml. ... ``` +- running on GPU + + ```shell + bash run_train_gpu.sh [DATASET_NAME] [DATASET_DIR] + ``` + + Training result will be stored in the scripts path, whose folder name begins with "train". 
You can find results like the + following in the log. + + ```python + Epoch:0, train loss=1.98709, train acc=0.39286 | val loss=1.98289, val acc=0.11600, time=9.33760, + Epoch:1, train loss=1.97645, train acc=0.35000 | val loss=1.97966, val acc=0.08200, time=0.08991, + Epoch:2, train loss=1.97862, train acc=0.28571 | val loss=1.97775, val acc=0.07400, time=0.08560, + Epoch:3, train loss=1.96789, train acc=0.25000 | val loss=1.97626, val acc=0.08800, time=0.09191, + Epoch:4, train loss=1.96938, train acc=0.26429 | val loss=1.97316, val acc=0.12800, time=0.08851, + ... + Epoch:195, train loss=1.52878, train acc=0.54286 | val loss=1.53312, val acc=0.80400, time=0.08753, + Epoch:196, train loss=1.71779, train acc=0.36429 | val loss=1.53045, val acc=0.80200, time=0.08570, + Epoch:197, train loss=1.59465, train acc=0.51429 | val loss=1.52906, val acc=0.80200, time=0.08549, + Epoch:198, train loss=1.54321, train acc=0.52857 | val loss=1.52826, val acc=0.81200, time=0.08514, + Epoch:199, train loss=1.59484, train acc=0.49286 | val loss=1.52897, val acc=0.81200, time=0.08544, + Test loss=1.5759763, test acc=0.8419999 + ... 
+ ``` + ## Inference Process ### [Export MindIR](#contents) @@ -269,19 +302,19 @@ test acc=0.84199995 ### [Performance](#contents) -| Parameter | GAT | -| ------------------------------------ | ----------------------------------------- | -| Resource | Ascend 910; OS Euler2.8 | -| uploaded Date | 07/05/2021(month/day/year) | -| MindSpore Version | 1.3.0 | -| Dataset | Cora/Citeseer | -| Training Parameter | epoch=200 | -| Optimizer | Adam | -| Loss Function | Softmax Cross Entropy | -| Accuracy | 83.0/72.5 | -| Speed | 0.195s/epoch | -| Total time | 39s | -| Scripts | [GAT Script](https://gitee.com/mindspore/models/tree/master/official/gnn/gat) | +| Parameter | GAT | GAT | +| ------------------ | ------------------------------------------------------------ | ------------------------------------------------------------ | +| Resource | Ascend 910; OS Euler2.8 | Tesla V100-PCIE | +| uploaded Date | 07/05/2021(month/day/year) | 10/20/2021(month/day/year) | +| MindSpore Version | 1.3.0 | 1.5.0-rc1 | +| Dataset | Cora/Citeseer | Cora/Citeseer | +| Training Parameter | epoch=200 | epoch=200 | +| Optimizer | Adam | Adam | +| Loss Function | Softmax Cross Entropy | Softmax Cross Entropy | +| Accuracy | 83.0/72.5 | 83.0/72.5 | +| Speed | 0.195s/epoch | 0.086s/epoch | +| Total time | 39s | 17.2s | +| Scripts | [GAT Script](https://gitee.com/mindspore/models/tree/master/official/gnn/gat) | [GAT Script](https://gitee.com/mindspore/models/tree/master/official/gnn/gat) | ## [Description of random situation](#contents) diff --git a/official/gnn/gat/README_CN.md b/official/gnn/gat/README_CN.md index 077b5a29a6d68b143eeaa387d7524f8d302059e8..1cb31672491aae8486e88c4f209e8dd02e9f86de 100644 --- a/official/gnn/gat/README_CN.md +++ b/official/gnn/gat/README_CN.md @@ -94,7 +94,7 @@ ## 环境要求 -- 硬件（Ascend） +- 硬件（Ascend/GPU） - 框架 - [MindSpore](https://www.mindspore.cn/install) - 如需查看详情，请参见如下资源： @@ -107,11 +107,18 @@ - Ascend处理器环境运行 - ```text + ```shell # 
使用Cora数据集运行训练示例，DATASET_NAME为cora bash run_train_ascend.sh [DATASET_NAME] ``` +- GPU处理器环境运行 + + ```shell + # 使用Cora数据集运行训练示例，DATASET_NAME为cora + bash run_train_gpu.sh [DATASET_NAME] [DATASET_DIR] + ``` + - 在 ModelArts 进行训练 (如果你想在modelarts上运行，可以参考以下文档 [modelarts](https://support.huaweicloud.com/modelarts/)) ```bash @@ -168,7 +175,8 @@ ├─README.md ├─scripts | ├─run_process_data_ascend.sh # 生成MindRecord格式的数据集 - | └─run_train_ascend.sh # 启动训练 + | ├─run_train_gpu.sh # 启动GPU训练 + | └─run_train_ascend.sh # 启动Ascend训练 | ├─src | ├─dataset.py # 数据预处理 @@ -207,7 +215,7 @@ - Ascend处理器环境运行 - ```python + ```shell bash run_train_ascend.sh [DATASET_NAME] ``` @@ -230,6 +238,31 @@ ... ``` +- GPU处理器环境运行 + + ```shell + bash run_train_gpu.sh [DATASET_NAME] [DATASET_DIR] + ``` + + 训练结果将保存在脚本路径下，文件夹名称以“train”开头。您可在日志中找到结果 + ，如下所示。 + + ```python + Epoch:0, train loss=1.98709, train acc=0.39286 | val loss=1.98289, val acc=0.11600, time=9.33760, + Epoch:1, train loss=1.97645, train acc=0.35000 | val loss=1.97966, val acc=0.08200, time=0.08991, + Epoch:2, train loss=1.97862, train acc=0.28571 | val loss=1.97775, val acc=0.07400, time=0.08560, + Epoch:3, train loss=1.96789, train acc=0.25000 | val loss=1.97626, val acc=0.08800, time=0.09191, + Epoch:4, train loss=1.96938, train acc=0.26429 | val loss=1.97316, val acc=0.12800, time=0.08851, + ... + Epoch:195, train loss=1.52878, train acc=0.54286 | val loss=1.53312, val acc=0.80400, time=0.08753, + Epoch:196, train loss=1.71779, train acc=0.36429 | val loss=1.53045, val acc=0.80200, time=0.08570, + Epoch:197, train loss=1.59465, train acc=0.51429 | val loss=1.52906, val acc=0.80200, time=0.08549, + Epoch:198, train loss=1.54321, train acc=0.52857 | val loss=1.52826, val acc=0.81200, time=0.08514, + Epoch:199, train loss=1.59484, train acc=0.49286 | val loss=1.52897, val acc=0.81200, time=0.08544, + Test loss=1.5759763, test acc=0.8419999 + ... 
+ ``` + ## 鎺ㄧ悊杩囩▼ ### [瀵煎嚭MindIR](#contents) @@ -266,19 +299,19 @@ test acc=0.84199995 ### 鎬ц兘 -| 鍙傛暟 | GAT | -| ------------------------------------ | ----------------------------------------- | -| 璧勬簮 | Ascend 910锛涚郴缁� Euler2.8 | -| 涓婁紶鏃ユ湡 | 2021-07-05 | -| MindSpore鐗堟湰 | 1.3.0 | -| 鏁版嵁闆� | Cora/Citeseer | -| 璁粌鍙傛暟 | epoch=200 | -| 浼樺寲鍣� | Adam | -| 鎹熷け鍑芥暟 | Softmax浜ゅ弶鐔� | -| 鍑嗙‘鐜� | 83.0/72.5 | -| 閫熷害 | 0.195s/epoch | -| 鎬绘椂闀� | 39s | -| 鑴氭湰 | <https://gitee.com/mindspore/models/tree/master/official/gnn/gat> | +| 鍙傛暟 | GAT | GAT | +| ------------------------------------ | ----------------------------------------- | ----------------------------------------- | +| 璧勬簮 | Ascend 910锛涚郴缁� Euler2.8 | Tesla V100-PCIE | +| 涓婁紶鏃ユ湡 | 2021-07-05 | 2021-10-20 | +| MindSpore鐗堟湰 | 1.3.0 | 1.5.0-rc1 | +| 鏁版嵁闆� | Cora/Citeseer | Cora/Citeseer | +| 璁粌鍙傛暟 | epoch=200 | epoch=200 | +| 浼樺寲鍣� | Adam | Adam | +| 鎹熷け鍑芥暟 | Softmax浜ゅ弶鐔� | Softmax浜ゅ弶鐔� | +| 鍑嗙‘鐜� | 83.0/72.5 | 83.0/72.5 | +| 閫熷害 | 0.195s/epoch | 0.086s/epoch | +| 鎬绘椂闀� | 39s | 17.2s | +| 鑴氭湰 | [GAT鑴氭湰](https://gitee.com/mindspore/models/tree/master/official/gnn/gat) | [GAT鑴氭湰](https://gitee.com/mindspore/models/tree/master/official/gnn/gat) | ## 闅忔満鎯呭喌璇存槑 diff --git a/official/gnn/gat/scripts/run_train_ascend.sh b/official/gnn/gat/scripts/run_train_ascend.sh index 08bf96a849f33e4b7902fce39c6f3dd813ae9d6e..6ab5afaccb5d475ea168af6b78ea1fcf7f57d570 100644 --- a/official/gnn/gat/scripts/run_train_ascend.sh +++ b/official/gnn/gat/scripts/run_train_ascend.sh @@ -46,11 +46,11 @@ echo "start training for device $DEVICE_ID" if [ $DATASET_NAME == cora ] then - python train.py --data_dir=$DATA_DIR/$DATASET_NAME &> log & + python train.py --data_dir=$DATA_DIR/$DATASET_NAME --device_target="Ascend" &> log & fi if [ $DATASET_NAME == citeseer ] then - python train.py --data_dir=$DATA_DIR/$DATASET_NAME --train_nodes_num=120 &> log & + python train.py --data_dir=$DATA_DIR/$DATASET_NAME --train_nodes_num=120 --device_target="Ascend" &> log 
& fi cd .. diff --git a/official/gnn/gat/scripts/run_train_gpu.sh b/official/gnn/gat/scripts/run_train_gpu.sh new file mode 100644 index 0000000000000000000000000000000000000000..7b17cd34afb861e0a23aa83a84e40feeb4697f75 --- /dev/null +++ b/official/gnn/gat/scripts/run_train_gpu.sh @@ -0,0 +1,50 @@ +#!/bin/bash +# Copyright 2021 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================ + +if [ $# != 2 ] +then + echo "Usage: sh run_train_gpu.sh [DATASET_NAME] [DATA_DIR]" +exit 1 +fi + +DATASET_NAME=$1 +DATA_DIR=$2 +echo $DATASET_NAME + +if [ -d "train" ]; +then + rm -rf ./train +fi +mkdir ./train +cp ../*.py ./train +cp ../*.yaml ./train +cp *.sh ./train +cp -r ../src ./train +cd ./train || exit +env > env.log +echo "start training" + + +if [ $DATASET_NAME == cora ] +then + python train.py --data_dir=$DATA_DIR/$DATASET_NAME --dataset=$DATASET_NAME --device_target="GPU" &> log & +fi + +if [ $DATASET_NAME == citeseer ] +then + python train.py --data_dir=$DATA_DIR/$DATASET_NAME --train_nodes_num=120 --dataset=$DATASET_NAME --device_target="GPU" &> log & +fi +cd .. 
diff --git a/official/gnn/gat/train.py b/official/gnn/gat/train.py index 5bd953541c78e902d547d060a0958d5590048279..ba6c60042e77e1c67c78f32d39bec64bcda6098c 100644 --- a/official/gnn/gat/train.py +++ b/official/gnn/gat/train.py @@ -1,4 +1,4 @@ -# Copyright 2020 Huawei Technologies Co., Ltd +# Copyright 2021 Huawei Technologies Co., Ltd # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -14,18 +14,21 @@ # ============================================================================ """Test train gat""" import os +import time +import numpy as np + from src.model_utils.config import config from src.model_utils.moxing_adapter import moxing_wrapper from src.dataset import load_and_process from src.gat import GAT from src.utils import LossAccuracyWrapper, TrainGAT -import numpy as np import mindspore.context as context from mindspore.train.serialization import save_checkpoint, load_checkpoint from mindspore import Tensor + def modelarts_pre_process(): config.data_dir = os.path.join(config.data_dir, config.dataset) @@ -36,7 +39,7 @@ def gnn_train(): if not os.path.exists("ckpts"): os.mkdir("ckpts") context.set_context(mode=context.GRAPH_MODE, - device_target="Ascend", + device_target=config.device_target, save_graphs=False) # train parameters hid_units = config.hid_units @@ -60,6 +63,8 @@ def gnn_train(): attn_drop=config.attn_dropout, ftr_drop=config.feature_dropout) gat_net.add_flags_recursive(fp16=True) + if config.dataset == "citeseer" and config.device_target == "GPU": + gat_net.add_flags_recursive(fp32=True) feature = Tensor(feature) biases = Tensor(biases) @@ -69,28 +74,26 @@ def gnn_train(): y_val, eval_mask, l2_coeff) - train_net = TrainGAT(gat_net, num_class, y_train, train_mask, lr, l2_coeff) - train_net.set_train(True) val_acc_max = 0.0 val_loss_min = np.inf for _epoch in range(num_epochs): + epoch_start = time.time() train_result = train_net(feature, biases) train_loss = 
train_result[0].asnumpy() train_acc = train_result[1].asnumpy() - eval_result = eval_net(feature, biases) eval_loss = eval_result[0].asnumpy() eval_acc = eval_result[1].asnumpy() - - print("Epoch:{}, train loss={:.5f}, train acc={:.5f} | val loss={:.5f}, val acc={:.5f}".format( - _epoch, train_loss, train_acc, eval_loss, eval_acc)) + epoch_time = time.time() - epoch_start + print("Epoch:{}, train loss={:.5f}, train acc={:.5f} | val loss={:.5f}, val acc={:.5f}, time={:.5f},".format( + _epoch, train_loss, train_acc, eval_loss, eval_acc, epoch_time)) if eval_acc >= val_acc_max or eval_loss < val_loss_min: if eval_acc >= val_acc_max and eval_loss < val_loss_min: val_acc_model = eval_acc @@ -116,7 +119,8 @@ def gnn_train(): ftr_drop=0.0) load_checkpoint("ckpts/gat.ckpt", net=gat_net_test) gat_net_test.add_flags_recursive(fp16=True) - + if config.dataset == "citeseer" and config.device_target == "GPU": + gat_net_test.add_flags_recursive(fp32=True) test_net = LossAccuracyWrapper(gat_net_test, num_class, y_test,