diff --git a/official/cv/FCN8s/scripts/run_train.sh b/official/cv/FCN8s/scripts/run_train.sh
index 7f0181f2a51ac8329772fe15070ba44f2e194a53..bbacbe6057297b3d089a355a1efaae960364ad11 100644
--- a/official/cv/FCN8s/scripts/run_train.sh
+++ b/official/cv/FCN8s/scripts/run_train.sh
@@ -33,6 +33,7 @@ export RANK_SIZE=$1
 RANK_TABLE_FILE=$(realpath $2)
 export RANK_TABLE_FILE
 echo "RANK_TABLE_FILE=${RANK_TABLE_FILE}"
+export HCCL_ALGO="level0:fullmesh;level1:fullmesh"
 
 export SERVER_ID=0
 rank_start=$((DEVICE_NUM * SERVER_ID))
diff --git a/official/cv/FCN8s/src/loss/loss.py b/official/cv/FCN8s/src/loss/loss.py
index ed3d85da55e84e97e926e160a7e1098b4a675da1..6665eaa745e98ed5bd5abd45bb97baacfd639be0 100644
--- a/official/cv/FCN8s/src/loss/loss.py
+++ b/official/cv/FCN8s/src/loss/loss.py
@@ -35,6 +35,7 @@ class SoftmaxCrossEntropyLoss(nn.Cell):
         self.div = P.RealDiv()
         self.transpose = P.Transpose()
         self.reshape = P.Reshape()
+        self.ce.softmax_cross_entropy.shard(((device_num, 1), (device_num, 1)))
         self.transpose.shard(((1, 1, 1, device_num),))
 
     def construct(self, logits, labels):
diff --git a/official/cv/FCN8s/train.py b/official/cv/FCN8s/train.py
index 6a8cc4eebb3efa286b8c7f363105bc60d1538080..bc95324b1d1928a4d85b9ac99af9468461fd67ed 100644
--- a/official/cv/FCN8s/train.py
+++ b/official/cv/FCN8s/train.py
@@ -23,6 +23,7 @@ from mindspore.train.callback import ModelCheckpoint, CheckpointConfig
 from mindspore.train.serialization import load_checkpoint, load_param_into_net
 from mindspore.communication.management import init, get_rank, get_group_size
 from mindspore.train.callback import LossMonitor, TimeMonitor
+from mindspore.parallel import set_algo_parameters
 from mindspore.train.loss_scale_manager import FixedLossScaleManager
 from mindspore.common import set_seed
 from src.data import dataset as data_generator
@@ -63,10 +64,15 @@ def train():
     config.rank = 0
     config.group_size = 1
     if device_num > 1:
-        parallel_mode = ParallelMode.DATA_PARALLEL
-        if config.parallel_mode in ParallelMode.MODE_LIST:
-            parallel_mode = config.parallel_mode
-        context.set_auto_parallel_context(parallel_mode=parallel_mode, gradients_mean=True, device_num=device_num)
+        parallel_mode = config.parallel_mode if config.parallel_mode in ParallelMode.MODE_LIST \
+            else ParallelMode.DATA_PARALLEL
+        if config.parallel_mode == "sharding_propagation":
+            context.set_auto_parallel_context(parallel_mode=ParallelMode.AUTO_PARALLEL, enable_alltoall=True,
+                                              search_mode="sharding_propagation", device_num=device_num,
+                                              gradients_mean=True, strategy_ckpt_save_file='strategy.ckpt')
+            set_algo_parameters(elementwise_op_strategy_follow=False, fully_use_devices=False)
+        else:
+            context.set_auto_parallel_context(parallel_mode=parallel_mode, gradients_mean=True, device_num=device_num)
         init()
         config.rank = get_rank()
         config.group_size = get_group_size()
@@ -88,8 +94,7 @@ def train():
     dataset = dataset.get_dataset(repeat=1)
 
     net = FCN8s(n_class=config.num_classes)
-    if context.get_auto_parallel_context("parallel_mode") in [ParallelMode.SEMI_AUTO_PARALLEL,
-                                                              ParallelMode.AUTO_PARALLEL]:
+    if config.parallel_mode in [ParallelMode.SEMI_AUTO_PARALLEL, ParallelMode.AUTO_PARALLEL]:
         net.set_model_parallel_shard_strategy(device_num)
 
     loss_ = loss.SoftmaxCrossEntropyLoss(config.num_classes, config.ignore_label, device_num=device_num)
diff --git a/official/recommend/wide_and_deep/default_config.yaml b/official/recommend/wide_and_deep/default_config.yaml
index c25e533b490a61cbfc10985b7a4a1e0d07860bac..b99add08de86a84edc07322a96eb8c2cd2dad5fd 100644
--- a/official/recommend/wide_and_deep/default_config.yaml
+++ b/official/recommend/wide_and_deep/default_config.yaml
@@ -32,6 +32,7 @@ dataset_type: "mindrecord"
 parameter_server: 0
 field_slice: False
 sparse: False
+use_sp: True
 deep_table_slice_mode: "column_slice"
 
 # WideDeepConfig
@@ -106,6 +107,7 @@ dataset_type: "tfrecord/mindrecord/hd5"
 parameter_server: "Open parameter server of not"
 field_slice: "Enable split field mode or not"
 sparse: "Enable sparse or not"
+use_sp: "Whether to use sharding_propagation instead of semi_parallel mode"
 deep_table_slice_mode: "column_slice/row_slice"
 
 epochs: "Total train epochs"
diff --git a/official/recommend/wide_and_deep/script/run_sharding_propagation.sh b/official/recommend/wide_and_deep/script/run_sharding_propagation.sh
new file mode 100644
index 0000000000000000000000000000000000000000..7d8ed037771fb382325abb0bc2c51e282d1313e4
--- /dev/null
+++ b/official/recommend/wide_and_deep/script/run_sharding_propagation.sh
@@ -0,0 +1,26 @@
+#!/bin/bash
+# Copyright 2022 Huawei Technologies Co., Ltd
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ============================================================================
+
+# bash run_sharding_propagation.sh RANK_SIZE HOSTFILE DATASET EPOCH_SIZE
+
+self_path=$(cd "$(dirname "$0")" || exit; pwd)
+RANK_SIZE=$1
+HOSTFILE=$2
+DATASET=$3
+EPOCH_SIZE=$4
+
+mpirun --allow-run-as-root -x PATH -x NCCL_IB_HCA -x NCCL_SOCKET_IFNAME -x LD_LIBRARY_PATH -x PYTHONPATH -x NCCL_DEBUG -x GLOG_v -n $RANK_SIZE --hostfile $HOSTFILE --output-filename log_output --merge-stderr-to-stdout \
+    python -s ${self_path}/../train_and_eval_auto_parallel.py --data_path=$DATASET --epochs=$EPOCH_SIZE --sparse=True > train_log.txt 2>&1 &
diff --git a/official/recommend/wide_and_deep/src/wide_and_deep.py b/official/recommend/wide_and_deep/src/wide_and_deep.py
index 86c6c80998588de2546e3e16717960f7d451ef26..15e2396fc1e09f9fc51f5cf8fcb00cbb2edfec61 100644
--- a/official/recommend/wide_and_deep/src/wide_and_deep.py
+++ b/official/recommend/wide_and_deep/src/wide_and_deep.py
@@ -178,10 +178,15 @@ class WideDeepModel(nn.Cell):
             if config.deep_table_slice_mode == "column_slice":
                 self.deep_embeddinglookup = nn.EmbeddingLookup(config.vocab_size, self.emb_dim, target=target,
                                                                slice_mode=nn.EmbeddingLookup.TABLE_COLUMN_SLICE)
-                self.dense_layer_1.dropout.dropout.shard(((1, get_group_size()),))
-                self.dense_layer_1.matmul.shard(((1, get_group_size()), (get_group_size(), 1)))
+                if config.use_sp:
+                    self.dense_layer_1.matmul.shard(((1, get_group_size()), (get_group_size(), 1)))
+                    self.dense_layer_1.bias_add.shard(((get_group_size(), 1), (1,)))
+                    self.deep_mul.shard(((1, 1, get_group_size()), (1, 1, 1)))
+                else:
+                    self.dense_layer_1.dropout.dropout.shard(((1, get_group_size()),))
+                    self.dense_layer_1.matmul.shard(((1, get_group_size()), (get_group_size(), 1)))
+                    self.deep_mul.shard(((1, 1, get_group_size()), (1, 1, 1)))
                 self.dense_layer_1.matmul.add_prim_attr("field_size", self.field_size)
-                self.deep_mul.shard(((1, 1, get_group_size()), (1, 1, 1)))
                 self.deep_reshape.add_prim_attr("skip_redistribution", True)
             else:
                 self.deep_embeddinglookup = nn.EmbeddingLookup(config.vocab_size, self.emb_dim, target=target,
diff --git a/official/recommend/wide_and_deep/train_and_eval_auto_parallel.py b/official/recommend/wide_and_deep/train_and_eval_auto_parallel.py
index 9f8b333b717e7547336a2139251ef8489326a707..d076d332b7bec876c01d9aa6aa89800b16bcaf19 100644
--- a/official/recommend/wide_and_deep/train_and_eval_auto_parallel.py
+++ b/official/recommend/wide_and_deep/train_and_eval_auto_parallel.py
@@ -21,6 +21,7 @@ import mindspore.dataset as ds
 from mindspore import Model, context
 from mindspore.train.callback import ModelCheckpoint, CheckpointConfig, TimeMonitor
 from mindspore.context import ParallelMode
+from mindspore.parallel import set_algo_parameters
 from mindspore.communication.management import get_rank, get_group_size, init
 from mindspore.nn.wrap.cell_wrapper import VirtualDatasetCellTriple
 
@@ -157,8 +158,14 @@ def train_wide_and_deep():
         init()
         context.set_context(save_graphs_path='./graphs_of_device_id_' + str(get_rank()), save_graphs=True)
     if cfg.sparse:
-        context.set_auto_parallel_context(
-            parallel_mode=ParallelMode.SEMI_AUTO_PARALLEL, gradients_mean=True)
+        if cfg.use_sp:
+            context.set_auto_parallel_context(parallel_mode=ParallelMode.AUTO_PARALLEL, enable_alltoall=True,
+                                              search_mode="sharding_propagation", gradients_mean=True,
+                                              strategy_ckpt_save_file='strategy.ckpt')
+            set_algo_parameters(elementwise_op_strategy_follow=False, fully_use_devices=False)
+        else:
+            context.set_auto_parallel_context(
+                parallel_mode=ParallelMode.SEMI_AUTO_PARALLEL, gradients_mean=True)
     else:
         context.set_auto_parallel_context(
             parallel_mode=ParallelMode.AUTO_PARALLEL, gradients_mean=True)
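For reference, the following is a minimal standalone sketch (not part of the patch) of the sharding-propagation setup that the hunks above add to FCN8s/train.py and wide_and_deep/train_and_eval_auto_parallel.py. It reuses only the MindSpore calls already visible in the diff; the availability of arguments such as enable_alltoall and search_mode="sharding_propagation" depends on the MindSpore version, and distributed bootstrapping (RANK_TABLE_FILE or mpirun) is assumed to be handled by the launch scripts.

    # Sketch: enable AUTO_PARALLEL with the "sharding_propagation" search mode,
    # mirroring the context configuration introduced by this patch.
    from mindspore import context
    from mindspore.context import ParallelMode
    from mindspore.communication.management import init
    from mindspore.parallel import set_algo_parameters

    context.set_context(mode=context.GRAPH_MODE, device_target="Ascend")
    context.set_auto_parallel_context(parallel_mode=ParallelMode.AUTO_PARALLEL,
                                      search_mode="sharding_propagation",
                                      enable_alltoall=True,
                                      gradients_mean=True,
                                      strategy_ckpt_save_file="strategy.ckpt")
    # Hand-written shard() strategies (e.g. on matmul, bias_add and mul above) are
    # propagated to the remaining operators; elementwise ops may pick their own
    # strategies and a strategy need not use every device, as configured here.
    set_algo_parameters(elementwise_op_strategy_follow=False, fully_use_devices=False)
    init()

For wide_and_deep, the new run_sharding_propagation.sh wraps the corresponding training entry point in an mpirun launch and, per its header comment, takes RANK_SIZE, HOSTFILE, DATASET and EPOCH_SIZE as positional arguments.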