diff --git a/utils/hccl_tools/README.md b/utils/hccl_tools/README.md index e9d928349b57242d4e24d89a9c135841d771e815..441c9c68d1167c41ddb40377c8d98063950a847e 100644 --- a/utils/hccl_tools/README.md +++ b/utils/hccl_tools/README.md @@ -2,22 +2,52 @@ MindSpore distributed training launch helper utility that will generate hccl config file. -## Usage +## hccl_tools.py -```python +This script is used to generate a rank_table_file for a single server by using `hccn_tool` or by reading `/etc/hccn.conf`. + +### Usage + +```bash python hccl_tools.py --device_num "[0,8)" ``` output: -```python +```bash hccl_[device_num]p_[which device]_[server_ip].json ``` -## Note +### Note Please note that the Ascend accelerators used must be continuous, such as [0,4) means to use four chips 0, 1, 2, 3; [0,1) means to use chip 0; The first four chips are a group, and the last four chips are a group. Apart from [0,8), which is allowed, other cross-group ranges such as [3,6) are prohibited. `--visible_devices` means the visible devices according to the software system. Usually used in the virtual system or docker container that makes the device_id mismatch logic_id. `--device_num` uses logic_id. For example "4,5,6,7" means the system has 4 logic chips which are actually the last 4 chips in hardware while `--device_num` could only be set to "[0, 4)" instead of "[4, 8)" `hccl_tools` uses `/etc/hccn.conf` to generate rank_table_file. `/etc/hccn.conf` is the configuration file about ascend accelerator resources. If you are using an entirely new server without setting up NIC ip for device, you could refer to this [Chinese guide](https://support.huaweicloud.com/instg-9000-A800_9000_9010/atlastrain_03_0049.html) or this [English guide](https://support.huaweicloud.com/intl/en-us/instg-cli-cann202/atlasrun_03_0051.html) to generate `hccn.conf`. + +## merge_hccl + +This script is used to merge the rank_table_files of single servers into one file for the cluster. 
+ +### Usage + +```bash +python merge_hccl.py hccl_1.json hccl_2.json +# or +python merge_hccl.py hccl*.json +``` + +output: + +```bash +hccl_[server_count]s_[rank_size]p.json +``` + +### Note + +Please note that the server order in the output config file follows the order of the input file list. + +For example, when running `python merge_hccl.py hccl_1.json hccl_2.json`, the 8 devices in hccl_1.json will be rank 0~7, and the 8 devices in hccl_2.json will be rank 8~15. + +When running with a wildcard, the exact order is not guaranteed, because it is decided by the system. Usually this results in dictionary order, just like the `ls` command, but we still suggest you check the result carefully if the order matters in your situation. diff --git a/utils/hccl_tools/hccl_tools.py b/utils/hccl_tools/hccl_tools.py index 2df333b5efca679b3fbe2b2f7d808dd3e6cda591..788d9190165e9a3043d39cc538d6c2f964724bc1 100644 --- a/utils/hccl_tools/hccl_tools.py +++ b/utils/hccl_tools/hccl_tools.py @@ -110,9 +110,23 @@ def main(): # construct hccn_table device_ips: Dict[Any, Any] = {} - for device_id in device_num_list: - ret = os.popen("hccn_tool -i %d -ip -g" % device_id).readlines() - device_ips[str(device_id)] = ret[0].split(":")[1].replace('\n', '') + try: + for device_id in device_num_list: + ret = os.popen("hccn_tool -i %d -ip -g" % device_id).readlines() + device_ips[str(device_id)] = ret[0].split(":")[1].replace('\n', '') + except IndexError: + print("Failed to call hccn_tool, try to read /etc/hccn.conf instead") + try: + with open('/etc/hccn.conf', 'r') as fin: + for hccn_item in fin.readlines(): + if hccn_item.strip().startswith('address_'): + device_id, device_ip = hccn_item.split('=') + device_id = device_id.split('_')[1] + device_ips[device_id] = device_ip.strip() + except OSError: + print("Failed to read /etc/hccn.conf") + raise SystemError("Failed to find information for hccl") + hccn_table = {'version': '1.0', 'server_count': '1', 'server_list': []} diff --git 
# Copyright 2021 Huawei Technologies Co., Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ============================================================================
"""Merge single-server hccl config files into one hccl config file for the cluster."""
import os
import sys
import json
from argparse import ArgumentParser


def parse_args(argv=None):
    """
    Parse the command line arguments.

    Args:
        argv (list[str], optional): Argument strings to parse. Default: None,
            in which case argparse reads them from ``sys.argv[1:]``.

    Returns:
        argparse.Namespace, whose ``file_list`` attribute holds the input
        hccl json paths in the order they were given.

    Examples:
        >>> parse_args(["hccl_1.json", "hccl_2.json"]).file_list
        ['hccl_1.json', 'hccl_2.json']
    """
    # The trailing space after "files" matters: adjacent string literals are
    # concatenated verbatim, so without it the help text reads "filesof".
    parser = ArgumentParser(description="Merge several hccl config json files "
                                        "of single server into one config file of the whole cluster")
    parser.add_argument("file_list", type=str, nargs="+", help="Hccl file lists")
    return parser.parse_args(argv)


def merge_hccl_tables(json_list):
    """
    Merge already-loaded hccl tables into one cluster table.

    Servers are appended in the order of ``json_list`` and every device gets a
    new, globally increasing ``rank_id``. The device dicts of the input tables
    are updated in place.

    Args:
        json_list (list[dict]): Loaded hccl tables. Each one must provide
            ``server_count`` and ``server_list``, and every server a ``device`` list.

    Returns:
        tuple(dict, int), the merged hccl table and the total number of devices (rank size).

    Raises:
        KeyError: If a table or server lacks one of the required keys.
    """
    server_count = 0
    rank_id = 0
    merged_servers = []

    for table in json_list:
        server_count += int(table['server_count'])
        server_list = table['server_list']
        for server in server_list:
            for device in server['device']:
                # Stored as a string, like 'version' and 'server_count' in the
                # same table, instead of a bare JSON integer.
                device['rank_id'] = str(rank_id)
                rank_id += 1
        merged_servers.extend(server_list)

    hccl_table = {'version': '1.0',
                  'server_count': f'{server_count}',
                  'server_list': merged_servers,
                  'status': 'completed'}
    return hccl_table, rank_id


def main(argv=None):
    """
    Load the given hccl files, merge them and write the result to the current directory.

    The output file is named ``hccl_[server_count]s_[rank_size]p.json``.

    Args:
        argv (list[str], optional): Argument strings forwarded to ``parse_args``. Default: None.
    """
    args = parse_args(argv)
    print(args.file_list)

    json_list = []
    for f_name in args.file_list:
        with open(f_name) as f:
            json_list.append(json.load(f))

    hccl_table, rank_size = merge_hccl_tables(json_list)

    table_name = os.path.join(os.getcwd(),
                              'hccl_{}s_{}p.json'.format(hccl_table['server_count'], rank_size))
    with open(table_name, 'w') as table_fp:
        json.dump(hccl_table, table_fp, indent=4)
    sys.stdout.flush()
    print("Completed: hccl file was save in :", table_name)


if __name__ == "__main__":
    main()