From 59c077746f062c43905d13fc72e4b5bebecc9af9 Mon Sep 17 00:00:00 2001 From: Ezra-Yu <18586273+Ezra-Yu@users.noreply.github.com> Date: Fri, 30 Jun 2023 13:55:13 +0800 Subject: [PATCH] [Feat] Download dataset by using MIM&OpenDataLab (#1630) * add dataset.index * update preprocess shell * update shell * update docs * update docs --- MANIFEST.in | 1 + dataset-index.yml | 11 ++++++++ docs/en/user_guides/dataset_prepare.md | 25 +++++++++++++++++++ docs/zh_CN/user_guides/dataset_prepare.md | 25 +++++++++++++++++++ setup.py | 2 +- .../dataset_converters/odl_cub_preprocess.sh | 15 +++++++++++ .../odl_imagenet1k_preprocess.sh | 22 ++++++++++++++++ 7 files changed, 100 insertions(+), 1 deletion(-) create mode 100644 dataset-index.yml create mode 100755 tools/dataset_converters/odl_cub_preprocess.sh create mode 100755 tools/dataset_converters/odl_imagenet1k_preprocess.sh diff --git a/MANIFEST.in b/MANIFEST.in index c039b371104..ad4d8dafbde 100644 --- a/MANIFEST.in +++ b/MANIFEST.in @@ -1,4 +1,5 @@ include requirements/*.txt include mmpretrain/.mim/model-index.yml +include mmpretrain/.mim/dataset-index.yml recursive-include mmpretrain/.mim/configs *.py *.yml recursive-include mmpretrain/.mim/tools *.py *.sh diff --git a/dataset-index.yml b/dataset-index.yml new file mode 100644 index 00000000000..ecf7f5b59ca --- /dev/null +++ b/dataset-index.yml @@ -0,0 +1,11 @@ +imagenet1k: + dataset: ImageNet-1K + download_root: data + data_root: data/imagenet + script: tools/dataset_converters/odl_imagenet1k_preprocess.sh + +cub: + dataset: CUB-200-2011 + download_root: data + data_root: data/CUB_200_2011 + script: tools/dataset_converters/odl_cub_preprocess.sh diff --git a/docs/en/user_guides/dataset_prepare.md b/docs/en/user_guides/dataset_prepare.md index 1e25d6e24f4..2aac22927a6 100644 --- a/docs/en/user_guides/dataset_prepare.md +++ b/docs/en/user_guides/dataset_prepare.md @@ -140,12 +140,37 @@ For a complete example about how to use the `CustomDataset`, please see [How to ImageNet 
has multiple versions, but the most commonly used one is [ILSVRC 2012](http://www.image-net.org/challenges/LSVRC/2012/). It can be accessed with the following steps. +`````{tabs} + +````{group-tab} Download by MIM + +MIM supports downloading from [OpenDataLab](https://opendatalab.com/) and preprocessing ImageNet dataset with one command line. + +_You need to register an account at [OpenDataLab official website](https://opendatalab.com/) and login by CLI._ + +```Bash +# install OpenDataLab CLI tools +pip install -U opendatalab +# log in to OpenDataLab, register if you don't have an account. +odl login +# download and preprocess by MIM, better to execute in $MMPreTrain directory. +mim download mmpretrain --dataset imagenet1k +``` + +```` + +````{group-tab} Download from Official Source + 1. Register an account and login to the [download page](http://www.image-net.org/download-images). 2. Find download links for ILSVRC2012 and download the following two files - ILSVRC2012_img_train.tar (~138GB) - ILSVRC2012_img_val.tar (~6.3GB) 3. Untar the downloaded files +```` + +````` + ### The Directory Structrue of the ImageNet dataset We support two ways of organizing the ImageNet dataset: Subfolder Format and Text Annotation File Format. 
diff --git a/docs/zh_CN/user_guides/dataset_prepare.md b/docs/zh_CN/user_guides/dataset_prepare.md index 9cebfc58d7b..59a0d0affbe 100644 --- a/docs/zh_CN/user_guides/dataset_prepare.md +++ b/docs/zh_CN/user_guides/dataset_prepare.md @@ -138,12 +138,37 @@ train_dataloader = dict( ImageNet 有多个版本,但最常用的一个是 [ILSVRC 2012](http://www.image-net.org/challenges/LSVRC/2012/)。 可以通过以下步骤使用它。 +`````{tabs} + +````{group-tab} MIM 下载 + +MIM支持使用一条命令行从 [OpenDataLab](https://opendatalab.com/) 下载并预处理 ImageNet 数据集。 + +_需要在 [OpenDataLab 官网](https://opendatalab.com/) 注册账号并命令行登录_。 + +```Bash +# 安装opendatalab库 +pip install -U opendatalab +# 登录到 OpenDataLab, 如果还没有注册,请到官网注册一个 +odl login +# 使用 MIM 下载数据集, 最好在 $MMPreTrain 目录执行 +mim download mmpretrain --dataset imagenet1k +``` + +```` + +````{group-tab} 从官网下载 + + 1. 注册一个帐户并登录到[下载页面](http://www.image-net.org/download-images)。 2. 找到 ILSVRC2012 的下载链接,下载以下两个文件: - ILSVRC2012_img_train.tar (~138GB) - ILSVRC2012_img_val.tar (~6.3GB) 3. 解压已下载的图片。 +```` +````` + ### ImageNet数据集目录结构 我们支持两种方式组织ImageNet数据集,子目录格式和文本注释文件格式。 diff --git a/setup.py b/setup.py index 6ed773f80dd..e68dff2be8d 100644 --- a/setup.py +++ b/setup.py @@ -117,7 +117,7 @@ def add_mim_extension(): else: return - filenames = ['tools', 'configs', 'model-index.yml'] + filenames = ['tools', 'configs', 'model-index.yml', 'dataset-index.yml'] repo_path = osp.dirname(__file__) mim_path = osp.join(repo_path, 'mmpretrain', '.mim') os.makedirs(mim_path, exist_ok=True) diff --git a/tools/dataset_converters/odl_cub_preprocess.sh b/tools/dataset_converters/odl_cub_preprocess.sh new file mode 100755 index 00000000000..6053d0e93b9 --- /dev/null +++ b/tools/dataset_converters/odl_cub_preprocess.sh @@ -0,0 +1,15 @@ +#!/usr/bin/env bash + +set -x + +DOWNLOAD_DIR=$1 +DATA_ROOT=$2 + +# unzip all of data +cat $DOWNLOAD_DIR/CUB-200-2011/raw/*.tar.gz | tar -xvz -C $DOWNLOAD_DIR + +# move data into DATA_ROOT +mv -f $DOWNLOAD_DIR/CUB-200-2011/CUB-200-2011/* $DATA_ROOT/ + +# remove useless data file +rm -R 
$DOWNLOAD_DIR/CUB-200-2011/ diff --git a/tools/dataset_converters/odl_imagenet1k_preprocess.sh b/tools/dataset_converters/odl_imagenet1k_preprocess.sh new file mode 100755 index 00000000000..e73ba37247d --- /dev/null +++ b/tools/dataset_converters/odl_imagenet1k_preprocess.sh @@ -0,0 +1,22 @@ +#!/usr/bin/env bash + +set -x + +DOWNLOAD_DIR=$1 +DATA_ROOT=$2 + +# unzip all of data +cat $DOWNLOAD_DIR/ImageNet-1K/raw/*.tar.gz.* | tar -xvz -C $DOWNLOAD_DIR + +# move images into data/imagenet +mv $DOWNLOAD_DIR/ImageNet-1K/{train,val,test} $DATA_ROOT + +# download the meta ann_files file +wget -P $DATA_ROOT https://download.openmmlab.com/mmclassification/datasets/imagenet/meta/caffe_ilsvrc12.tar.gz + +# unzip meta ann_files file and put it into 'meta' folder +mkdir $DATA_ROOT/meta +tar -xzvf $DATA_ROOT/caffe_ilsvrc12.tar.gz -C $DATA_ROOT/meta + +# remove useless data files +rm -R $DOWNLOAD_DIR/ImageNet-1K