Unverified Commit 70166a7d authored by i-robot, committed by Gitee

!3745 modify delf Google Landmarks Dataset v2 dataset download shell

Merge pull request !3745 from anzhengqi/delf-download-shell
parents 914cf156 0973072e
http://dl.yf.io/dla/models/imagenet/dla34-ba72cf86.pth
https://github.com/mcordts/cityscapesScripts/blob/master/cityscapesscripts/helpers/labels.py_key
https://persagen.com/files/misc/wang2014knowledge.pdf
https://s3.amazonaws.com/google-landmark/metadata
https://s3.amazonaws.com/google-landmark/md5sum
@@ -45,7 +45,7 @@ def get_lr(global_step, lr_init, lr_end, lr_max, warmup_epochs1, warmup_epochs2,
warmup_steps4 = warmup_steps3 + steps_per_epoch * warmup_epochs4
warmup_steps5 = warmup_steps4 + steps_per_epoch * warmup_epochs5
step_radio = [1e-4, 1e-3, 1e-2, 0.1]
if hasattr(config, finetune) and config.finetune:
if hasattr(config, "finetune") and config.finetune:
step_radio = [1e-4, 1e-2, 0.1, 1]
for i in range(total_steps):
if i < warmup_steps1:
......
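The one-line change above matters because `hasattr` expects the attribute name as a string: the old code passed the bare name `finetune`, which is undefined in that scope and raises `NameError` at runtime. A standalone illustration (not repo code), runnable from the shell:

```shell
python3 -c '
class Config:
    finetune = True
config = Config()
# hasattr(config, finetune) would raise NameError: name "finetune" is not defined
print(hasattr(config, "finetune") and config.finetune)   # True
'
```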
@@ -98,9 +98,27 @@
```shell
# Download the Google Landmarks Dataset v2 training set and convert it to mindrecord files
# [Note] Prepare at least 1.1 TB of storage space; if space is insufficient, the optional parameter [NEED_ROMOVE_TAR] can be set to 'y', which reduces usage to about 633 GB
bash scripts/download_gldv2.sh 500 [DATASET_PATH] [NEED_ROMOVE_TAR]
# example: bash scripts/download_gldv2.sh 500 /home/gldv2 y
# [Note] In total, 4 csv files, 500 tar files, and 500 md5 files will be downloaded, occupying about 633 GB; reserve enough storage space
# Downloading the dataset takes a long time, and a single run may fail because of network fluctuations. The three arguments of
# download_gldv2.sh are the smallest file index to download, the largest file index, and the save path
bash scripts/download_gldv2.sh 0 499 [DATASET_PATH]
# example: bash scripts/download_gldv2.sh 0 499 /home/gldv2
# After the download finishes, compare each downloaded tar file's md5 value with its md5 file (see the sketch below); if they match,
# the download is correct, otherwise it failed and must be re-downloaded. When re-downloading, set the first two arguments to select
# the files to fetch, e.g. '1 1' downloads only images_001.tar. If train.csv, train_clean.csv, train_attribution.csv, and
# train_label_to_category.csv were already downloaded successfully, adjust the script as described in its comments
cd [DATASET_PATH]/train
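# A minimal md5 verification sketch (an illustration, not part of the repo scripts;
# it mirrors the cut-based comparison in the original download script and assumes
# each md5.images_XXX.txt starts with the hash):
for i in $(seq -f "%03g" 0 499); do
    want=$(cut -d' ' -f1 md5.images_${i}.txt)
    got=$(md5sum images_${i}.tar | cut -d' ' -f1)
    [ "$want" = "$got" ] || echo "images_${i}.tar failed the md5 check"
done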
# Extract the 500 downloaded tar files (a loop sketch follows)
tar xvf images_xxx.tar # 000, 001, 002, 003, ...
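# A loop sketch over all 500 archives (assumes every tar sits in the current directory):
for i in $(seq -f "%03g" 0 499); do
    tar xf images_${i}.tar
done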
python3 src/build_image_dataset.py \
--train_csv_path=[DATASET_PATH]/train/train.csv \
--train_clean_csv_path=[DATASET_PATH]/train/train_clean.csv \
--train_directory=[DATASET_PATH]/train/*/*/*/ \
--output_directory=[DATASET_PATH]/mindrecord/ \
--num_shards=128 \
--validation_split_size=0.2
# Download Oxford5k and Paris6k together with their corresponding ground truth files
bash scripts/download_oxf.sh [DATASET_PATH]
@@ -276,12 +294,10 @@
Use the following command to download the training set of the `Google Landmarks Dataset v2` dataset, automatically extract its clean subset (for the precise definition, see the paper cited for this dataset in the [数据集](#数据集) section), and convert it to mindrecord format:
```shell
bash scripts/download_gldv2.sh 500 [DATASET_PATH] [NEED_ROMOVE_TAR]
# example: bash scripts/download_gldv2.sh 500 /home/gldv2 y
bash scripts/download_gldv2.sh 0 499 [DATASET_PATH]
# example: bash scripts/download_gldv2.sh 0 499 /home/gldv2
```
Prepare at least 1.1 TB of storage space. If space is insufficient, the optional parameter `[NEED_ROMOVE_TAR]` can be set to `y`, which deletes each tar package after it has been extracted; the dataset then occupies about 633 GB. If that is still not enough, search `src/build_image_dataset.py` for the `os.remove` statements and uncomment them, so that source images are deleted on the fly while being converted to mindrecord format; with this change the dataset occupies about 450 GB. After the conversion, if space is scarce, the `train` directory can be deleted outright; the remaining `mindrecord` directory occupies about 103 GB.
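To locate the statements to uncomment, a quick search suffices (a usage sketch; the matching line numbers depend on the repo version):

```shell
grep -n "os.remove" src/build_image_dataset.py
```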
Directory layout and notes:
```shell
......
#!/bin/bash
# Copyright 2021 Huawei Technologies Co., Ltd
# Copyright 2022 Huawei Technologies Co., Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
@@ -16,234 +16,46 @@
# This script downloads the Google Landmarks v2 dataset TRAIN split. To download the dataset
# run the script like in the following example:
# bash download_gldv2.sh 500 [DATASET_PATH] [NEED_ROMOVE_TAR]
# bash download_gldv2.sh 0 499 [DATASET_PATH]
#
# The script takes the following parameters, in order:
# - number of image files from the TRAIN split to download (maximum 500)
# - path for dataset
if [[ $# -lt 2 || $# -gt 3 ]]
if [[ $# -lt 3 ]]
then
echo "Usage: bash download_gldv2.sh 500 [DATASET_PATH] [NEED_ROMOVE_TAR]
NEED_ROMOVE_TAR is optional, whether remove tar after extracting the images, choices: 'y' and 'n', default 'n' "
echo "Usage: bash download_gldv2.sh [BEGIN_IDX] [END_IDX] [DATASET_PATH]"
exit 1
fi
need_remove_tar="n"
if [ $# == 3 ]
then
if [ "$3" == "y" ] || [ "$3" == "n" ];then
need_remove_tar=$3
else
echo "weather need remove tar or not, it's value must be in [y, n]"
exit 1
fi
fi
image_files_train=$1 # Number of image files to download from the TRAIN split
dataset_root_folder=$2
begin_idx=$1
end_idx=$2
dataset_root_folder=$3
split="train"
metadata_url="https://s3.amazonaws.com/google-landmark/metadata"
csv_train=("${metadata_url}/train.csv" "${metadata_url}/train_clean.csv" "${metadata_url}/train_attribution.csv" "${metadata_url}/train_label_to_category.csv")
export csv_train
images_tar_file_base_url="https://s3.amazonaws.com/google-landmark"
images_md5_file_base_url="https://s3.amazonaws.com/google-landmark/md5sum"
num_processes=8
make_folder() {
# Creates a folder and checks if it exists. Exits if folder creation fails.
local folder=$1
if [ -d "${folder}" ]; then
echo "Folder ${folder} already exists. Skipping folder creation."
else
echo "Creating folder ${folder}."
if mkdir -p ${folder}; then
echo "Successfully created folder ${folder}."
else
echo "Failed to create folder ${folder}. Exiting."
exit 1
fi
fi
}
download_file() {
# Downloads a file from an URL into a specified folder.
local file_url=$1
local folder=$2
# local file_path="${folder}/`basename ${file_url}`"
echo "Downloading file ${file_url} to folder ${folder}."
pushd . > /dev/null
cd ${folder}
curl -Os -C - --retry 10 ${file_url}
popd > /dev/null
}
validate_md5_checksum() {
# Validate the MD5 checksum of a downloaded file.
local content_file=$1
local md5_file=$2
echo "Checking MD5 checksum of file ${content_file} against ${md5_file}"
if [[ "${OSTYPE}" == "linux-gnu" ]]; then
content_md5=`md5sum ${content_file}`
elif [[ "${OSTYPE}" == "darwin"* ]]; then
content_md5=`md5 -r "${content_file}"`
fi
content_md5=`cut -d' ' -f1<<<"${content_md5}"`
expected_md5=`cut -d' ' -f1 "${md5_file}"`
if [[ "$content_md5" != "" && "$content_md5" = "$expected_md5" ]]; then
echo "Check ${content_file} passed."
return 0
else
echo "Check failed. MD5 checksums don't match. Exiting."
return 1
fi
}
extract_tar_file() {
# Extracts the content of a tar file to a specified folder.
local tar_file=$1
local folder=$2
echo "Extracting file ${tar_file} to folder ${folder}"
tar -C ${folder} -xf ${tar_file}
if [ $need_remove_tar == "y" ]; then
rm -rf ${tar_file}
fi
}
download_image_file() {
# Downloads one image file of a split and validates its md5 checksum.
local split=$1
local idx=`printf "%03g" $2`
local split_folder=$3
local images_md5_file=md5.images_${idx}.txt
local images_md5_file_url=${images_md5_file_base_url}/${split}/${images_md5_file}
local images_md5_file_path=${split_folder}/${images_md5_file}
download_file "${images_md5_file_url}" "${split_folder}"
local images_tar_file=images_${idx}.tar
local images_tar_file_url=${images_tar_file_base_url}/${split}/${images_tar_file}
local images_tar_file_path=${split_folder}/${images_tar_file}
download_file "${images_tar_file_url}" "${split_folder}"
if ! validate_md5_checksum "${images_tar_file_path}" "${images_md5_file_path}" ; then
echo "${images_tar_file_path} failed the md5 check; re-downloading the md5 file and re-validating"
download_file "${images_md5_file_url}" "${split_folder}"
validate_md5_checksum "${images_tar_file_path}" "${images_md5_file_path}"
fi
#extract_tar_file "${images_tar_file_path}" "${split_folder}"
}
check_image_file() {
# Checks one image file of a split, re-downloading it if missing or corrupt, and untars it.
local split=$1
local idx=`printf "%03g" $2`
local split_folder=$3
local images_md5_file=md5.images_${idx}.txt
local images_md5_file_url=${images_md5_file_base_url}/${split}/${images_md5_file}
local images_md5_file_path=${split_folder}/${images_md5_file}
if ! [ -f "${images_md5_file_path}" ]; then
echo "${images_md5_file_path} not found!"
download_file "${images_md5_file_url}" "${split_folder}"
else
local filesize=`wc -c < "${images_md5_file_path}" `
echo "md5file size is ${filesize}"
if [[ "${filesize}" -lt 40 ]]; then
echo "${images_md5_file_path} not complete"
download_file "${images_md5_file_url}" "${split_folder}"
fi
fi
local images_tar_file=images_${idx}.tar
local images_tar_file_url=${images_tar_file_base_url}/${split}/${images_tar_file}
local images_tar_file_path=${split_folder}/${images_tar_file}
if ! [ -f "${images_tar_file_path}" ]; then
echo "${images_tar_file_path} not found!"
download_file "${images_tar_file_url}" "${split_folder}"
if ! validate_md5_checksum "${images_tar_file_path}" "${images_md5_file_path}" ; then
echo "${images_tar_file_path} failed the md5 check; re-downloading the md5 file and re-validating"
download_file "${images_md5_file_url}" "${split_folder}"
validate_md5_checksum "${images_tar_file_path}" "${images_md5_file_path}"
fi
else
if ! validate_md5_checksum "${images_tar_file_path}" "${images_md5_file_path}" ; then
echo "${images_tar_file_path} not complete "
download_file "${images_tar_file_url}" "${split_folder}"
validate_md5_checksum "${images_tar_file_path}" "${images_md5_file_path}"
fi
fi
extract_tar_file "${images_tar_file_path}" "${split_folder}"
}
download_image_files() {
# Downloads all image files of a split and untars them.
local split=$1
local split_folder=$2
local max_idx=$(expr ${image_files_train} - 1)
echo "Downloading ${image_files_train} files form the split ${split} in the folder ${split_folder}."
for i in $(seq 0 ${num_processes} ${max_idx}); do
local curr_max_idx=$(expr ${i} + ${num_processes} - 1)
local last_idx=$((${curr_max_idx}>${max_idx}?${max_idx}:${curr_max_idx}))
for j in $(seq ${i} 1 ${last_idx}); do download_image_file "${split}" "${j}" "${split_folder}" & done
wait
done
}
check_image_files() {
# Checks all image files of a split, repairing them as needed, and untars them.
local split=$1
local split_folder=$2
local max_idx=$(expr ${image_files_train} - 1)
echo "Downloading ${image_files_train} files form the split ${split} in the folder ${split_folder}."
for i in $(seq 0 1 ${max_idx}); do
local curr_max_idx=$(expr ${i} + 1 - 1)
local last_idx=$((${curr_max_idx}>${max_idx}?${max_idx}:${curr_max_idx}))
for j in $(seq ${i} 1 ${last_idx}); do check_image_file "${split}" "${j}" "${split_folder}" & done
wait
done
}
download_csv_files() {
# Downloads all metadata CSV files of a split.
local split=$1
local split_folder=$2
local csv_list="csv_${split}[*]"
for csv_file in ${!csv_list}; do
download_file "${csv_file}" "${split_folder}"
done
}
download_split() {
# Downloads all artifacts, metadata CSV files and image files of a single split.
local split=$1
local split_folder=${dataset_root_folder}/${split}
make_folder "${split_folder}"
download_csv_files "${split}" "${split_folder}"
download_image_files "${split}" "${split_folder}"
check_image_files "${split}" "${split_folder}"
}
download_all_splits() {
# Downloads all artifacts, metadata CSV files and image files of all splits.
make_folder "${dataset_root_folder}"
download_split "${split}"
}
download_all_splits
python3 src/build_image_dataset.py \
--train_csv_path=${dataset_root_folder}/train/train.csv \
--train_clean_csv_path=${dataset_root_folder}/train/train_clean.csv \
--train_directory=${dataset_root_folder}/train/*/*/*/ \
--output_directory=${dataset_root_folder}/mindrecord/ \
--num_shards=128 \
--validation_split_size=0.2
exit 0
mkdir -p ${dataset_root_folder}/${split}
# If the csv files have already been downloaded successfully, comment out the next 7 lines.
csv_train="train.csv train_clean.csv train_attribution.csv train_label_to_category.csv"
for file_name in ${csv_train}; do
echo "filename $file_name"
file_url=${metadata_url}/${file_name}
echo "Download $file_url to ${dataset_root_folder}/${split}/${file_name} ..."
wget ${file_url} -t 10 -O ${dataset_root_folder}/${split}/${file_name}
done
for i in $(seq ${begin_idx} 1 ${end_idx}); do
idx=`printf "%03g" $i`
images_md5_file=md5.images_${idx}.txt
images_tar_file=images_${idx}.tar
images_tar_file_url=${images_tar_file_base_url}/${split}/${images_tar_file}
images_md5_file_url=${images_md5_file_base_url}/${split}/${images_md5_file}
echo "Download ${images_tar_file_url} to ${dataset_root_folder}/${split}/${images_tar_file} ..."
wget ${images_tar_file_url} -t 10 -O ${dataset_root_folder}/${split}/${images_tar_file}
echo "Download ${images_md5_file} to ${dataset_root_folder}/${split}/${images_md5_file} ..."
wget ${images_md5_file_url} -t 10 -O ${dataset_root_folder}/${split}/${images_md5_file}
done
\ No newline at end of file
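After a failed md5 check, a narrowed index range re-fetches just the broken archive, e.g. for images_001.tar (an example run using the paths from this guide):

```shell
bash scripts/download_gldv2.sh 1 1 /home/gldv2
```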