From 90dab92464e94ac5c4139c37f07ed21df9c2affb Mon Sep 17 00:00:00 2001 From: pone7 <361357427@qq.com> Date: Sat, 20 Aug 2022 07:17:57 +0800 Subject: [PATCH] update mixup benchmarks & awesome lists --- README.md | 5 +- .../inaturalist2018/basic_sz224_4xbs64.py | 54 +++++ .../r18_l2_a2_near_L1_01_mlr5e_2_ep800.py | 1 - .../basic/r18_l2_a2_near_L1_01_mlr5e_2.py | 1 - .../basic/rx50_l2_a2_near_L1_01_mlr5e_2.py | 1 - .../basic/wrn28_8_l1_a2_near_L1_01_mlr1e_3.py | 1 - ...e_2_lam_mul_k0_25_mask_adjust0_25_ep800.py | 1 - ...ili_val_dp0_mul_x_cat_L1_var_01_mlr5e_2.py | 1 - ...ili_val_dp0_mul_x_cat_L1_var_01_mlr5e_2.py | 1 - ...ili_val_dp0_mul_x_cat_L1_var_01_mlr1e_3.py | 1 - ...8_l2_a2_near_lam_cat_mb_mlr1e_3_bb_mlr0.py | 1 - ...ar_lam_cat_mb_mlr1e_3_m09_bb_mlr0_ep300.py | 1 - ...ear_lam_cat_mb_mlr1e_3_m0_bb_mlr0_ep100.py | 1 - ...am_cat_mb_mlr1e_3_m0_bb_mlr0_fp16_ep100.py | 1 - ...0_l2_a2_bili_lam_cat_mb_mlr1e_3_bb_mlr0.py | 1 - ...0_l2_a2_near_lam_cat_mb_mlr1e_3_bb_mlr0.py | 1 - ...i_val_dp01_mul_mb_mlr1e_3_bb_mlr0_4xb64.py | 1 - ..._mul_mb_mlr1e_3_m09_bb_mlr0_4xb64_ep300.py | 1 - ...1_mul_mb_mlr1e_3_m0_bb_mlr0_4xb64_ep100.py | 1 - ..._mb_mlr1e_3_m0_bb_mlr0_4xb64_fp16_ep100.py | 1 - ...i_val_dp01_mul_mb_mlr1e_3_bb_mlr0_4xb64.py | 1 - .../classification/inaturalist2017/README.md | 61 ++++++ ..._lam_cat_mb_mlr1e_3_bb_mlr0_4xb64_ep100.py | 48 +++++ ...lam_cat_mb_mlr1e_4_bb_mlr0_4xb64_ep100.py} | 3 +- ...lam_cat_mb_mlr1e_4_bb_mlr0_4xb64_ep100.py} | 2 +- ...val_dp01_mb_mlr1e_3_bb_mlr0_4xb64_ep100.py | 62 ++++++ ...al_dp01_mb_mlr1e_4_bb_mlr0_4xb64_ep100.py} | 3 +- ..._bili_val_dp01_mb_mlr1e_4_bb_mlr0_4xb64.py | 2 +- .../classification/inaturalist2018/README.md | 61 ++++++ ..._lam_cat_mb_mlr1e_4_bb_mlr0_4xb64_ep100.py | 80 +++++++ ..._lam_cat_mb_mlr1e_4_bb_mlr0_4xb64_ep100.py | 12 ++ .../mixups/basic/r18_mixups_CE_none_4xb64.py | 16 ++ .../mixups/basic/r50_mixups_CE_none_4xb64.py | 43 ++++ .../basic/rx101_mixups_CE_none_4xb64.py | 16 ++ ...val_dp01_mb_mlr1e_4_bb_mlr0_4xb64_ep100.py | 94 +++++++++ ...val_dp01_mb_mlr1e_4_bb_mlr0_4xb64_ep100.py | 12 ++ configs/classification/place205/README.md | 61 ++++++ ...lam_cat_mb_mlr1e_3_bb_mlr0_4xb64_ep100.py} | 3 +- ...lam_cat_mb_mlr1e_3_bb_mlr0_4xb64_ep100.py} | 1 - ...val_dp01_mb_mlr1e_4_bb_mlr0_4xb64_ep100.py | 50 +++++ ...val_dp01_mb_mlr1e_4_bb_mlr0_4xb64_ep100.py | 88 ++++++++ .../r18_l2_a2_near_mb_mlr1e_3_bb_mlr5e_2.py | 1 - ..._l2_a2_near_mb_mlr1e_3_bb_mlr5e_2_2xb50.py | 1 - ...val_dp0_mul_mb_mlr1e_3_bb_mlr5e_2_ep400.py | 1 - ..._bili_val_dp0_mul_mb_mlr1e_3_bb_mlr5e_2.py | 1 - ...b50_lam_mul_k0_25_mask_adjust0_25_ep400.py | 1 - ...val_dp0_mul_mb_mlr1e_3_bb_mlr5e_2_2xb50.py | 1 - docs/en/awesome_mixups/Mixup_SL.md | 12 +- docs/en/awesome_selfsup/MIM.md | 24 +++ openmixup/models/backbones/lan.py | 183 ++++++++-------- openmixup/models/heads/pmix_block.py | 199 ++++-------------- 51 files changed, 932 insertions(+), 288 deletions(-) create mode 100644 configs/classification/_base_/datasets/inaturalist2018/basic_sz224_4xbs64.py create mode 100644 configs/classification/inaturalist2017/README.md create mode 100644 configs/classification/inaturalist2017/automix/basic/r18_l2_a2_near_lam_cat_mb_mlr1e_3_bb_mlr0_4xb64_ep100.py rename configs/classification/inaturalist2017/automix/basic/{r50_l2_a2_near_lam_cat_mb_mlr1e_4_bb_mlr0_4xb64.py => r50_l2_a2_near_lam_cat_mb_mlr1e_4_bb_mlr0_4xb64_ep100.py} (97%) rename configs/classification/inaturalist2017/automix/basic/{rx101_l2_a2_near_lam_cat_mb_mlr1e_4_bb_mlr0_4xb64.py => 
rx101_l2_a2_near_lam_cat_mb_mlr1e_4_bb_mlr0_4xb64_ep100.py} (79%)
 create mode 100644 configs/classification/inaturalist2017/samix/basic/r18_l2_a2_bili_val_dp01_mb_mlr1e_3_bb_mlr0_4xb64_ep100.py
 rename configs/classification/inaturalist2017/samix/basic/{r50_l2_a2_bili_val_dp01_mb_mlr1e_4_bb_mlr0_4xb64.py => r50_l2_a2_bili_val_dp01_mb_mlr1e_4_bb_mlr0_4xb64_ep100.py} (97%)
 create mode 100644 configs/classification/inaturalist2018/README.md
 create mode 100644 configs/classification/inaturalist2018/automix/basic/r50_l2_a2_near_lam_cat_mb_mlr1e_4_bb_mlr0_4xb64_ep100.py
 create mode 100644 configs/classification/inaturalist2018/automix/basic/rx101_l2_a2_near_lam_cat_mb_mlr1e_4_bb_mlr0_4xb64_ep100.py
 create mode 100644 configs/classification/inaturalist2018/mixups/basic/r18_mixups_CE_none_4xb64.py
 create mode 100644 configs/classification/inaturalist2018/mixups/basic/r50_mixups_CE_none_4xb64.py
 create mode 100644 configs/classification/inaturalist2018/mixups/basic/rx101_mixups_CE_none_4xb64.py
 create mode 100644 configs/classification/inaturalist2018/samix/basic/r50_l2_a2_bili_val_dp01_mb_mlr1e_4_bb_mlr0_4xb64_ep100.py
 create mode 100644 configs/classification/inaturalist2018/samix/basic/rx101_l2_a2_bili_val_dp01_mb_mlr1e_4_bb_mlr0_4xb64_ep100.py
 create mode 100644 configs/classification/place205/README.md
 rename configs/classification/place205/automix/basic/{r18_l2_a2_near_lam_cat_mb_mlr1e_3_bb_mlr0_4xb64.py => r18_l2_a2_near_lam_cat_mb_mlr1e_3_bb_mlr0_4xb64_ep100.py} (94%)
 rename configs/classification/place205/automix/basic/{r50_l2_a2_near_lam_cat_mb_mlr1e_3_bb_mlr0_4xb64.py => r50_l2_a2_near_lam_cat_mb_mlr1e_3_bb_mlr0_4xb64_ep100.py} (99%)
 create mode 100644 configs/classification/place205/samix/r18_l2_a2_bili_val_dp01_mb_mlr1e_4_bb_mlr0_4xb64_ep100.py
 create mode 100644 configs/classification/place205/samix/r50_l2_a2_bili_val_dp01_mb_mlr1e_4_bb_mlr0_4xb64_ep100.py

diff --git a/README.md b/README.md
index 542f940c..059aba2d 100644
--- a/README.md
+++ b/README.md
@@ -32,6 +32,8 @@ The main branch works with **PyTorch 1.8** (required by some self-supervised met
 ## What's New
+[2022-08-19] Weights and logs for mixup benchmarks are released.
+
 [2022-07-30] `OpenMixup` v0.2.5 is released (issue [#10](https://github.com/Westlake-AI/openmixup/issues/10)).
## Installation @@ -127,7 +129,8 @@ Please refer to [Model Zoos](docs/en/model_zoos) for various backbones, mixup me - [x] [FGVC-Aircraft](https://arxiv.org/abs/1306.5151) [[download](https://www.robots.ox.ac.uk/~vgg/data/fgvc-aircraft/)] [[config](https://github.com/Westlake-AI/openmixup/tree/main/configs/classification/aircrafts/)] - [x] [StandfoldCars](http://ai.stanford.edu/~jkrause/papers/3drr13.pdf) [[download](http://ai.stanford.edu/~jkrause/cars/car_dataset.html)] - [x] [Place205](http://places2.csail.mit.edu/index.html) [[download](http://places.csail.mit.edu/downloadData.html)] [[config](https://github.com/Westlake-AI/openmixup/tree/main/configs/classification/place205/)] - - [x] [iNaturalist-2017/2018](https://arxiv.org/abs/1707.06642) [[download](https://github.com/visipedia/inat_comp)] [[config](https://github.com/Westlake-AI/openmixup/tree/main/configs/classification/inaturalist2017/)] + - [x] [iNaturalist-2017](https://arxiv.org/abs/1707.06642) [[download](https://github.com/visipedia/inat_comp/tree/master/2017)] [[config](https://github.com/Westlake-AI/openmixup/tree/main/configs/classification/inaturalist2017/)] + - [x] [iNaturalist-2018](https://arxiv.org/abs/1707.06642) [[download](https://github.com/visipedia/inat_comp/tree/master/2018)] [[config](https://github.com/Westlake-AI/openmixup/tree/main/configs/classification/inaturalist2018/)] * Self-supervised algorithms for visual representation. diff --git a/configs/classification/_base_/datasets/inaturalist2018/basic_sz224_4xbs64.py b/configs/classification/_base_/datasets/inaturalist2018/basic_sz224_4xbs64.py new file mode 100644 index 00000000..1822fb73 --- /dev/null +++ b/configs/classification/_base_/datasets/inaturalist2018/basic_sz224_4xbs64.py @@ -0,0 +1,54 @@ +# dataset settings +data_source_cfg = dict(type='ImageNet') +# iNat dataset +data_train_list = 'data/meta/iNaturalist2018/train_labeled_full.txt' +data_train_root = 'data/iNaturalist2018/train' +data_test_list = 'data/meta/iNaturalist2018/val_labeled.txt' +data_test_root = 'data/iNaturalist2018/val/' + +dataset_type = 'ClassificationDataset' +img_norm_cfg = dict(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]) +train_pipeline = [ + dict(type='RandomResizedCrop', size=224, interpolation=3), # bicubic + dict(type='RandomHorizontalFlip'), +] +test_pipeline = [ + dict(type='Resize', size=256, interpolation=3), # 0.85 + dict(type='CenterCrop', size=224), + dict(type='ToTensor'), + dict(type='Normalize', **img_norm_cfg), +] +# prefetch +prefetch = True +if not prefetch: + train_pipeline.extend([dict(type='ToTensor'), dict(type='Normalize', **img_norm_cfg)]) + +data = dict( + imgs_per_gpu=64, # V100: 64 x 4gpus = bs256 + workers_per_gpu=8, # according to total cpus cores, usually 4 workers per 32~128 imgs + train=dict( + type=dataset_type, + data_source=dict( + list_file=data_train_list, root=data_train_root, + **data_source_cfg), + pipeline=train_pipeline, + prefetch=prefetch, + ), + val=dict( + type=dataset_type, + data_source=dict( + list_file=data_test_list, root=data_test_root, **data_source_cfg), + pipeline=test_pipeline, + prefetch=False, + )) + +# validation hook +evaluation = dict( + initial=False, + interval=1, + imgs_per_gpu=128, + workers_per_gpu=4, + eval_param=dict(topk=(1, 5))) + +# checkpoint +checkpoint_config = dict(interval=1, max_keep_ckpts=1) diff --git a/configs/classification/cifar100/automix/basic/r18/unsampling_modenearest/r18_l2_a2_near_L1_01_mlr5e_2_ep800.py 
b/configs/classification/cifar100/automix/basic/r18/unsampling_modenearest/r18_l2_a2_near_L1_01_mlr5e_2_ep800.py index 556ca493..cbf5a780 100644 --- a/configs/classification/cifar100/automix/basic/r18/unsampling_modenearest/r18_l2_a2_near_L1_01_mlr5e_2_ep800.py +++ b/configs/classification/cifar100/automix/basic/r18/unsampling_modenearest/r18_l2_a2_near_L1_01_mlr5e_2_ep800.py @@ -31,7 +31,6 @@ x_qk_concat=False, x_v_concat=False, # SAMix x concat: none # att_norm_cfg=dict(type='BN'), # norm after q,k (design for fp16, also conduct better performace in fp32) mask_loss_mode="L1", mask_loss_margin=0.1, # L1 loss, 0.1 - mask_mode="none_v_", frozen=False), head_one=dict( type='ClsHead', # default CE diff --git a/configs/classification/cifar100/automix/basic/r18_l2_a2_near_L1_01_mlr5e_2.py b/configs/classification/cifar100/automix/basic/r18_l2_a2_near_L1_01_mlr5e_2.py index c4101800..a22289fd 100644 --- a/configs/classification/cifar100/automix/basic/r18_l2_a2_near_L1_01_mlr5e_2.py +++ b/configs/classification/cifar100/automix/basic/r18_l2_a2_near_L1_01_mlr5e_2.py @@ -31,7 +31,6 @@ x_qk_concat=False, x_v_concat=False, # SAMix x concat: none # att_norm_cfg=dict(type='BN'), # norm after q,k (design for fp16, also conduct better performace in fp32) mask_loss_mode="L1", mask_loss_margin=0.1, # L1 loss, 0.1 - mask_mode="none_v_", frozen=False), head_one=dict( type='ClsHead', # default CE diff --git a/configs/classification/cifar100/automix/basic/rx50_l2_a2_near_L1_01_mlr5e_2.py b/configs/classification/cifar100/automix/basic/rx50_l2_a2_near_L1_01_mlr5e_2.py index 9d4fd095..2070d54e 100644 --- a/configs/classification/cifar100/automix/basic/rx50_l2_a2_near_L1_01_mlr5e_2.py +++ b/configs/classification/cifar100/automix/basic/rx50_l2_a2_near_L1_01_mlr5e_2.py @@ -32,7 +32,6 @@ x_qk_concat=False, x_v_concat=False, # SAMix x concat: none # att_norm_cfg=dict(type='BN'), # norm after q,k (design for fp16, also conduct better performace in fp32) mask_loss_mode="L1", mask_loss_margin=0.1, # L1 loss, 0.1 - mask_mode="none_v_", frozen=False), head_one=dict( type='ClsHead', # default CE diff --git a/configs/classification/cifar100/automix/basic/wrn28_8_l1_a2_near_L1_01_mlr1e_3.py b/configs/classification/cifar100/automix/basic/wrn28_8_l1_a2_near_L1_01_mlr1e_3.py index d4ed3c07..08c5d37a 100644 --- a/configs/classification/cifar100/automix/basic/wrn28_8_l1_a2_near_L1_01_mlr1e_3.py +++ b/configs/classification/cifar100/automix/basic/wrn28_8_l1_a2_near_L1_01_mlr1e_3.py @@ -33,7 +33,6 @@ x_qk_concat=False, x_v_concat=False, # SAMix x concat: none # att_norm_cfg=dict(type='BN'), # norm after q,k (design for fp16, also conduct better performace in fp32) mask_loss_mode="L1", mask_loss_margin=0.1, # L1 loss, 0.1 - mask_mode="none_v_", frozen=False), head_one=dict( type='ClsHead', # default CE diff --git a/configs/classification/cifar100/samix/basic/r18/unsampling_modebilinear/r18_l2_a2_bili_val_dp0_mul_x_cat_L1_var_01_mlr5e_2_lam_mul_k0_25_mask_adjust0_25_ep800.py b/configs/classification/cifar100/samix/basic/r18/unsampling_modebilinear/r18_l2_a2_bili_val_dp0_mul_x_cat_L1_var_01_mlr5e_2_lam_mul_k0_25_mask_adjust0_25_ep800.py index 090c4230..6e583051 100644 --- a/configs/classification/cifar100/samix/basic/r18/unsampling_modebilinear/r18_l2_a2_bili_val_dp0_mul_x_cat_L1_var_01_mlr5e_2_lam_mul_k0_25_mask_adjust0_25_ep800.py +++ b/configs/classification/cifar100/samix/basic/r18/unsampling_modebilinear/r18_l2_a2_bili_val_dp0_mul_x_cat_L1_var_01_mlr5e_2_lam_mul_k0_25_mask_adjust0_25_ep800.py @@ -39,7 +39,6 @@ x_qk_concat=True, 
x_v_concat=False, # SAMix x concat: q,k # att_norm_cfg=dict(type='BN'), # norm after q,k (design for fp16, also conduct better performace in fp32) mask_loss_mode="L1+Variance", mask_loss_margin=0.1, # L1+Var loss, tricks in SAMix - mask_mode="none_v_", frozen=False), head_one=dict( type='ClsHead', # default CE diff --git a/configs/classification/cifar100/samix/basic/r18_l2_a2_bili_val_dp0_mul_x_cat_L1_var_01_mlr5e_2.py b/configs/classification/cifar100/samix/basic/r18_l2_a2_bili_val_dp0_mul_x_cat_L1_var_01_mlr5e_2.py index 35859ed7..1fd11c6a 100644 --- a/configs/classification/cifar100/samix/basic/r18_l2_a2_bili_val_dp0_mul_x_cat_L1_var_01_mlr5e_2.py +++ b/configs/classification/cifar100/samix/basic/r18_l2_a2_bili_val_dp0_mul_x_cat_L1_var_01_mlr5e_2.py @@ -39,7 +39,6 @@ x_qk_concat=True, x_v_concat=False, # SAMix x concat: q,k # att_norm_cfg=dict(type='BN'), # norm after q,k (design for fp16, also conduct better performace in fp32) mask_loss_mode="L1+Variance", mask_loss_margin=0.1, # L1+Var loss, tricks in SAMix - mask_mode="none_v_", frozen=False), head_one=dict( type='ClsHead', # default CE diff --git a/configs/classification/cifar100/samix/basic/rx50_l2_a2_bili_val_dp0_mul_x_cat_L1_var_01_mlr5e_2.py b/configs/classification/cifar100/samix/basic/rx50_l2_a2_bili_val_dp0_mul_x_cat_L1_var_01_mlr5e_2.py index 734cab32..5fae4e01 100644 --- a/configs/classification/cifar100/samix/basic/rx50_l2_a2_bili_val_dp0_mul_x_cat_L1_var_01_mlr5e_2.py +++ b/configs/classification/cifar100/samix/basic/rx50_l2_a2_bili_val_dp0_mul_x_cat_L1_var_01_mlr5e_2.py @@ -40,7 +40,6 @@ x_qk_concat=True, x_v_concat=False, # SAMix x concat: q,k # att_norm_cfg=dict(type='BN'), # norm after q,k (design for fp16, also conduct better performace in fp32) mask_loss_mode="L1+Variance", mask_loss_margin=0.1, # L1+Var loss, tricks in SAMix - mask_mode="none_v_", frozen=False), head_one=dict( type='ClsHead', # default CE diff --git a/configs/classification/cifar100/samix/basic/wrn28_8_l1_a2_bili_val_dp0_mul_x_cat_L1_var_01_mlr1e_3.py b/configs/classification/cifar100/samix/basic/wrn28_8_l1_a2_bili_val_dp0_mul_x_cat_L1_var_01_mlr1e_3.py index d153663e..77ff7952 100644 --- a/configs/classification/cifar100/samix/basic/wrn28_8_l1_a2_bili_val_dp0_mul_x_cat_L1_var_01_mlr1e_3.py +++ b/configs/classification/cifar100/samix/basic/wrn28_8_l1_a2_bili_val_dp0_mul_x_cat_L1_var_01_mlr1e_3.py @@ -41,7 +41,6 @@ x_qk_concat=True, x_v_concat=False, # SAMix x concat: q,k # att_norm_cfg=dict(type='BN'), # norm after q,k (design for fp16, also conduct better performace in fp32) mask_loss_mode="L1+Variance", mask_loss_margin=0.1, # L1+Var loss, tricks in SAMix - mask_mode="none_v_", frozen=False), head_one=dict( type='ClsHead', # default CE diff --git a/configs/classification/imagenet/automix/basic/r18_l2_a2_near_lam_cat_mb_mlr1e_3_bb_mlr0.py b/configs/classification/imagenet/automix/basic/r18_l2_a2_near_lam_cat_mb_mlr1e_3_bb_mlr0.py index 6063a015..2d5d36a9 100644 --- a/configs/classification/imagenet/automix/basic/r18_l2_a2_near_lam_cat_mb_mlr1e_3_bb_mlr0.py +++ b/configs/classification/imagenet/automix/basic/r18_l2_a2_near_lam_cat_mb_mlr1e_3_bb_mlr0.py @@ -31,7 +31,6 @@ x_qk_concat=False, x_v_concat=False, # SAMix x concat: none # att_norm_cfg=dict(type='BN'), # norm after q,k (design for fp16, also conduct better performace in fp32) mask_loss_mode="L1", mask_loss_margin=0.1, # L1 loss, 0.1 - mask_mode="none_v_", frozen=False), head_one=dict( type='ClsHead', # default CE diff --git 
a/configs/classification/imagenet/automix/basic/r50/unsampling_modenearest/r50_l2_a2_near_lam_cat_mb_mlr1e_3_m09_bb_mlr0_ep300.py b/configs/classification/imagenet/automix/basic/r50/unsampling_modenearest/r50_l2_a2_near_lam_cat_mb_mlr1e_3_m09_bb_mlr0_ep300.py index d99e3098..8c7b1c34 100644 --- a/configs/classification/imagenet/automix/basic/r50/unsampling_modenearest/r50_l2_a2_near_lam_cat_mb_mlr1e_3_m09_bb_mlr0_ep300.py +++ b/configs/classification/imagenet/automix/basic/r50/unsampling_modenearest/r50_l2_a2_near_lam_cat_mb_mlr1e_3_m09_bb_mlr0_ep300.py @@ -31,7 +31,6 @@ x_qk_concat=False, x_v_concat=False, # SAMix x concat: none # att_norm_cfg=dict(type='BN'), # norm after q,k (design for fp16, also conduct better performace in fp32) mask_loss_mode="L1", mask_loss_margin=0.1, # L1 loss, 0.1 - mask_mode="none_v_", frozen=False), head_one=dict( type='ClsHead', # default CE diff --git a/configs/classification/imagenet/automix/basic/r50/unsampling_modenearest/r50_l2_a2_near_lam_cat_mb_mlr1e_3_m0_bb_mlr0_ep100.py b/configs/classification/imagenet/automix/basic/r50/unsampling_modenearest/r50_l2_a2_near_lam_cat_mb_mlr1e_3_m0_bb_mlr0_ep100.py index e23c389e..68520436 100644 --- a/configs/classification/imagenet/automix/basic/r50/unsampling_modenearest/r50_l2_a2_near_lam_cat_mb_mlr1e_3_m0_bb_mlr0_ep100.py +++ b/configs/classification/imagenet/automix/basic/r50/unsampling_modenearest/r50_l2_a2_near_lam_cat_mb_mlr1e_3_m0_bb_mlr0_ep100.py @@ -31,7 +31,6 @@ x_qk_concat=False, x_v_concat=False, # SAMix x concat: none # att_norm_cfg=dict(type='BN'), # norm after q,k (design for fp16, also conduct better performace in fp32) mask_loss_mode="L1", mask_loss_margin=0.1, # L1 loss, 0.1 - mask_mode="none_v_", frozen=False), head_one=dict( type='ClsHead', # default CE diff --git a/configs/classification/imagenet/automix/basic/r50/unsampling_modenearest/r50_l2_a2_near_lam_cat_mb_mlr1e_3_m0_bb_mlr0_fp16_ep100.py b/configs/classification/imagenet/automix/basic/r50/unsampling_modenearest/r50_l2_a2_near_lam_cat_mb_mlr1e_3_m0_bb_mlr0_fp16_ep100.py index 86b7d892..7bfd8750 100644 --- a/configs/classification/imagenet/automix/basic/r50/unsampling_modenearest/r50_l2_a2_near_lam_cat_mb_mlr1e_3_m0_bb_mlr0_fp16_ep100.py +++ b/configs/classification/imagenet/automix/basic/r50/unsampling_modenearest/r50_l2_a2_near_lam_cat_mb_mlr1e_3_m0_bb_mlr0_fp16_ep100.py @@ -31,7 +31,6 @@ x_qk_concat=False, x_v_concat=False, # SAMix x concat: none # att_norm_cfg=dict(type='BN'), # norm after q,k (design for fp16, also conduct better performace in fp32) mask_loss_mode="L1", mask_loss_margin=0.1, # L1 loss, 0.1 - mask_mode="none_v_", frozen=False), head_one=dict( type='ClsHead', # default CE diff --git a/configs/classification/imagenet/automix/basic/r50_l2_a2_bili_lam_cat_mb_mlr1e_3_bb_mlr0.py b/configs/classification/imagenet/automix/basic/r50_l2_a2_bili_lam_cat_mb_mlr1e_3_bb_mlr0.py index 997bd83e..a400d299 100644 --- a/configs/classification/imagenet/automix/basic/r50_l2_a2_bili_lam_cat_mb_mlr1e_3_bb_mlr0.py +++ b/configs/classification/imagenet/automix/basic/r50_l2_a2_bili_lam_cat_mb_mlr1e_3_bb_mlr0.py @@ -31,7 +31,6 @@ x_qk_concat=False, x_v_concat=False, # SAMix x concat: none # att_norm_cfg=dict(type='BN'), # norm after q,k (design for fp16, also conduct better performace in fp32) mask_loss_mode="L1+Variance", mask_loss_margin=0.1, # L1+Var loss, tricks in SAMix - mask_mode="none_v_", frozen=False), head_one=dict( type='ClsHead', # default CE diff --git 
a/configs/classification/imagenet/automix/basic/r50_l2_a2_near_lam_cat_mb_mlr1e_3_bb_mlr0.py b/configs/classification/imagenet/automix/basic/r50_l2_a2_near_lam_cat_mb_mlr1e_3_bb_mlr0.py index 15faef1c..f420400d 100644 --- a/configs/classification/imagenet/automix/basic/r50_l2_a2_near_lam_cat_mb_mlr1e_3_bb_mlr0.py +++ b/configs/classification/imagenet/automix/basic/r50_l2_a2_near_lam_cat_mb_mlr1e_3_bb_mlr0.py @@ -31,7 +31,6 @@ x_qk_concat=False, x_v_concat=False, # SAMix x concat: none # att_norm_cfg=dict(type='BN'), # norm after q,k (design for fp16, also conduct better performace in fp32) mask_loss_mode="L1", mask_loss_margin=0.1, # L1 loss, 0.1 - mask_mode="none_v_", frozen=False), head_one=dict( type='ClsHead', # default CE diff --git a/configs/classification/imagenet/samix/basic/r18_l2_a2_bili_val_dp01_mul_mb_mlr1e_3_bb_mlr0_4xb64.py b/configs/classification/imagenet/samix/basic/r18_l2_a2_bili_val_dp01_mul_mb_mlr1e_3_bb_mlr0_4xb64.py index ed095dd2..fb65e9b9 100644 --- a/configs/classification/imagenet/samix/basic/r18_l2_a2_bili_val_dp01_mul_mb_mlr1e_3_bb_mlr0_4xb64.py +++ b/configs/classification/imagenet/samix/basic/r18_l2_a2_bili_val_dp01_mul_mb_mlr1e_3_bb_mlr0_4xb64.py @@ -39,7 +39,6 @@ x_qk_concat=True, x_v_concat=False, # SAMix x concat: q,k # att_norm_cfg=dict(type='BN'), # norm after q,k (design for fp16, also conduct better performace in fp32) mask_loss_mode="L1+Variance", mask_loss_margin=0.1, # L1+Var loss, tricks in SAMix - mask_mode="none_v_", frozen=False), head_one=dict( type='ClsHead', # default CE diff --git a/configs/classification/imagenet/samix/basic/r50/unsampling_modebilinear/r50_l2_a2_bili_val_dp01_mul_mb_mlr1e_3_m09_bb_mlr0_4xb64_ep300.py b/configs/classification/imagenet/samix/basic/r50/unsampling_modebilinear/r50_l2_a2_bili_val_dp01_mul_mb_mlr1e_3_m09_bb_mlr0_4xb64_ep300.py index f7f6b5f0..c4d39f3f 100644 --- a/configs/classification/imagenet/samix/basic/r50/unsampling_modebilinear/r50_l2_a2_bili_val_dp01_mul_mb_mlr1e_3_m09_bb_mlr0_4xb64_ep300.py +++ b/configs/classification/imagenet/samix/basic/r50/unsampling_modebilinear/r50_l2_a2_bili_val_dp01_mul_mb_mlr1e_3_m09_bb_mlr0_4xb64_ep300.py @@ -39,7 +39,6 @@ x_qk_concat=True, x_v_concat=False, # SAMix x concat: q,k # att_norm_cfg=dict(type='BN'), # norm after q,k (design for fp16, also conduct better performace in fp32) mask_loss_mode="L1+Variance", mask_loss_margin=0.1, # L1+Var loss, tricks in SAMix - mask_mode="none_v_", frozen=False), head_one=dict( type='ClsHead', # default CE diff --git a/configs/classification/imagenet/samix/basic/r50/unsampling_modebilinear/r50_l2_a2_bili_val_dp01_mul_mb_mlr1e_3_m0_bb_mlr0_4xb64_ep100.py b/configs/classification/imagenet/samix/basic/r50/unsampling_modebilinear/r50_l2_a2_bili_val_dp01_mul_mb_mlr1e_3_m0_bb_mlr0_4xb64_ep100.py index 6a87843d..b85056c4 100644 --- a/configs/classification/imagenet/samix/basic/r50/unsampling_modebilinear/r50_l2_a2_bili_val_dp01_mul_mb_mlr1e_3_m0_bb_mlr0_4xb64_ep100.py +++ b/configs/classification/imagenet/samix/basic/r50/unsampling_modebilinear/r50_l2_a2_bili_val_dp01_mul_mb_mlr1e_3_m0_bb_mlr0_4xb64_ep100.py @@ -39,7 +39,6 @@ x_qk_concat=True, x_v_concat=False, # SAMix x concat: q,k # att_norm_cfg=dict(type='BN'), # norm after q,k (design for fp16, also conduct better performace in fp32) mask_loss_mode="L1+Variance", mask_loss_margin=0.1, # L1+Var loss, tricks in SAMix - mask_mode="none_v_", frozen=False), head_one=dict( type='ClsHead', # default CE diff --git 
a/configs/classification/imagenet/samix/basic/r50/unsampling_modebilinear/r50_l2_a2_bili_val_dp01_mul_mb_mlr1e_3_m0_bb_mlr0_4xb64_fp16_ep100.py b/configs/classification/imagenet/samix/basic/r50/unsampling_modebilinear/r50_l2_a2_bili_val_dp01_mul_mb_mlr1e_3_m0_bb_mlr0_4xb64_fp16_ep100.py index 3ebc498b..7684f5f8 100644 --- a/configs/classification/imagenet/samix/basic/r50/unsampling_modebilinear/r50_l2_a2_bili_val_dp01_mul_mb_mlr1e_3_m0_bb_mlr0_4xb64_fp16_ep100.py +++ b/configs/classification/imagenet/samix/basic/r50/unsampling_modebilinear/r50_l2_a2_bili_val_dp01_mul_mb_mlr1e_3_m0_bb_mlr0_4xb64_fp16_ep100.py @@ -39,7 +39,6 @@ x_qk_concat=True, x_v_concat=False, # SAMix x concat: q,k # att_norm_cfg=dict(type='BN'), # norm after q,k (design for fp16, also conduct better performace in fp32) mask_loss_mode="L1+Variance", mask_loss_margin=0.1, # L1+Var loss, tricks in SAMix - mask_mode="none_v_", frozen=False), head_one=dict( type='ClsHead', # default CE diff --git a/configs/classification/imagenet/samix/basic/r50_l2_a2_bili_val_dp01_mul_mb_mlr1e_3_bb_mlr0_4xb64.py b/configs/classification/imagenet/samix/basic/r50_l2_a2_bili_val_dp01_mul_mb_mlr1e_3_bb_mlr0_4xb64.py index bd3753ea..72820272 100644 --- a/configs/classification/imagenet/samix/basic/r50_l2_a2_bili_val_dp01_mul_mb_mlr1e_3_bb_mlr0_4xb64.py +++ b/configs/classification/imagenet/samix/basic/r50_l2_a2_bili_val_dp01_mul_mb_mlr1e_3_bb_mlr0_4xb64.py @@ -39,7 +39,6 @@ x_qk_concat=True, x_v_concat=False, # SAMix x concat: q,k # att_norm_cfg=dict(type='BN'), # norm after q,k (design for fp16, also conduct better performace in fp32) mask_loss_mode="L1+Variance", mask_loss_margin=0.1, # L1+Var loss, tricks in SAMix - mask_mode="none_v_", frozen=False), head_one=dict( type='ClsHead', # default CE diff --git a/configs/classification/inaturalist2017/README.md b/configs/classification/inaturalist2017/README.md new file mode 100644 index 00000000..afe2890f --- /dev/null +++ b/configs/classification/inaturalist2017/README.md @@ -0,0 +1,61 @@ +# Mixup Classification Benchmark on iNaturalist-2017 + +> [The iNaturalist Challenge 2017 Dataset](https://arxiv.org/abs/1707.06642) + +## Abstract + +Existing image classification datasets used in computer vision tend to have an even number of images for each object category. In contrast, the natural world is heavily imbalanced, as some species are more abundant and easier to photograph than others. To encourage further progress in challenging real world conditions we present the iNaturalist Challenge 2017 dataset - an image classification benchmark consisting of 675,000 images with over 5,000 different species of plants and animals. It features many visually similar species, captured in a wide variety of situations, from all over the world. Images were collected with different camera types, have varying image quality, have been verified by multiple citizen scientists, and feature a large class imbalance. We discuss the collection of the dataset and present baseline results for state-of-the-art computer vision classification models. Results show that current non-ensemble based methods achieve only 64% top one classification accuracy, illustrating the difficulty of the dataset. Finally, we report results from a competition that was held with the data. + +
+
+## Results and models
+
+We provide a collection of [weights and logs](https://github.com/Westlake-AI/openmixup/releases/tag/mixup-inat2017-weights) for mixup classification benchmark on iNaturalist-2017. You can download all results from **Baidu Cloud**: [iNaturalist-2017 (1e7w)](https://pan.baidu.com/s/1GsoXVpIBXPjyFKsCdnmp9Q).
+
+* All compared methods adopt ResNet-18/50 and ResNeXt-101 (32x4d) architectures and are trained for 100 epochs using the PyTorch training recipe. The training and testing image size is 224 with a CenterCrop ratio of 0.85. We search $\alpha$ in $Beta(\alpha, \alpha)$ for all compared methods (see the sketch after this README).
+* The **median** of top-1 accuracy in the last 5 training epochs is reported for ResNet variants.
+* Visualizations of mixed samples from [AutoMix](https://arxiv.org/abs/2103.13027) and [SAMix](https://arxiv.org/abs/2111.15454) are provided in zip files.
+
+### iNaturalist-2017
+
+| Backbones                                                    | ResNet-18 top-1 | ResNet-50 top-1 | ResNeXt-101 top-1 |
+|--------------------------------------------------------------|:---------------:|:---------------:|:-----------------:|
+| Vanilla                                                      | 51.79           | 60.23           | 63.70             |
+| MixUp [[ICLR'2018](https://arxiv.org/abs/1710.09412)]        | 51.40           | 61.22           | 66.27             |
+| CutMix [[ICCV'2019](https://arxiv.org/abs/1905.04899)]       | 51.24           | 62.34           | 67.59             |
+| ManifoldMix [[ICML'2019](https://arxiv.org/abs/1806.05236)]  | 51.83           | 61.47           | 66.08             |
+| SaliencyMix [[ICLR'2021](https://arxiv.org/abs/2006.01791)]  | 51.29           | 62.51           | 67.20             |
+| FMix [[arXiv'2020](https://arxiv.org/abs/2002.12047)]        | 52.01           | 61.90           | 66.64             |
+| PuzzleMix [[ICML'2020](https://arxiv.org/abs/2009.06962)]    | -               | 62.66           | 67.72             |
+| ResizeMix [[arXiv'2020](https://arxiv.org/abs/2012.11101)]   | 51.21           | 62.29           | 66.82             |
+| AutoMix [[ECCV'2022](https://arxiv.org/abs/2103.13027)]      | 52.84           | 63.08           | 68.03             |
+| SAMix [[arXiv'2021](https://arxiv.org/abs/2111.15454)]       | 53.42           | 63.32           | 68.26             |
+
+We summarize mixup benchmarks in [Model Zoo](https://github.com/Westlake-AI/openmixup/tree/main/docs/en/model_zoos/Model_Zoo_sup.md).
+
+
+## Citation
+
+Please refer to the original paper of iNaturalist-2017 and AutoMix for details.
+
+```bibtex
+@article{Horn2017TheIC,
+  title={The iNaturalist Challenge 2017 Dataset},
+  author={Grant Van Horn and Oisin Mac Aodha and Yang Song and Alexander Shepard and Hartwig Adam and Pietro Perona and Serge J. Belongie},
+  journal={ArXiv},
+  year={2017},
+  volume={abs/1707.06642}
+}
+```
+```bibtex
+@misc{eccv2022automix,
+      title={AutoMix: Unveiling the Power of Mixup for Stronger Classifiers},
+      author={Zicheng Liu and Siyuan Li and Di Wu and Zhiyuan Chen and Lirong Wu and Jianzhu Guo and Stan Z. Li},
+      year={2021},
+      eprint={2103.13027},
+      archivePrefix={arXiv},
+      primaryClass={cs.CV}
+}
+```
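As referenced in the benchmark bullets above, all compared methods draw their mixing ratio from $Beta(\alpha, \alpha)$. A minimal sketch of the static input mixup this implies, assuming a plain PyTorch batch (the `mixup_batch` helper below is illustrative, not OpenMixup's API):

```python
# Editorial sketch (not part of the patch): input mixup with lam ~ Beta(alpha, alpha).
import numpy as np
import torch

def mixup_batch(x: torch.Tensor, y: torch.Tensor, alpha: float = 2.0):
    """Pairwise mixup; the loss becomes lam * CE(p, y) + (1 - lam) * CE(p, y[index])."""
    lam = float(np.random.beta(alpha, alpha))  # e.g. alpha=2.0, as in the AutoMix/SAMix configs below
    index = torch.randperm(x.size(0))          # random in-batch pairing
    x_mix = lam * x + (1.0 - lam) * x[index]   # pixel-level interpolation
    return x_mix, y, y[index], lam
```

Larger $\alpha$ concentrates $\lambda$ around 0.5 (stronger mixing), while $\alpha < 1$ favors values near 0 or 1, which is why $\alpha$ is searched per method.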
diff --git a/configs/classification/inaturalist2017/automix/basic/r18_l2_a2_near_lam_cat_mb_mlr1e_3_bb_mlr0_4xb64_ep100.py b/configs/classification/inaturalist2017/automix/basic/r18_l2_a2_near_lam_cat_mb_mlr1e_3_bb_mlr0_4xb64_ep100.py
new file mode 100644
index 00000000..8f1bdc6d
--- /dev/null
+++ b/configs/classification/inaturalist2017/automix/basic/r18_l2_a2_near_lam_cat_mb_mlr1e_3_bb_mlr0_4xb64_ep100.py
@@ -0,0 +1,48 @@
+_base_ = "r50_l2_a2_near_lam_cat_mb_mlr1e_4_bb_mlr0_4xb64_ep100.py"
+
+# model settings
+model = dict(
+    type='AutoMixup',
+    pretrained=None,
+    alpha=2.0,
+    momentum=0.999,  # 0.999 to 0.99999
+    mask_layer=2,
+    mask_loss=0.1,  # using mask loss
+    mask_adjust=0,
+    lam_margin=0.08,  # degenerate to mixup when lam or 1-lam <= 0.08
+    mask_up_override=None,  # If not none, override upsampling when train MixBlock
+    debug=False,  # show attention and content map
+    backbone=dict(
+        type='ResNet',
+        depth=18,
+        num_stages=4,
+        out_indices=(2,3),  # stage-3 for MixBlock, x-1: stage-x
+        style='pytorch'),
+    mix_block = dict(  # AutoMix
+        type='PixelMixBlock',
+        in_channels=256, reduction=2, use_scale=True,
+        unsampling_mode=['nearest',],  # str or list, train & test MixBlock
+        lam_concat=True, lam_concat_v=False,  # AutoMix: lam cat q,k,v
+        lam_mul=False, lam_residual=False, lam_mul_k=-1,  # SAMix lam: none
+        value_neck_cfg=None,  # SAMix: non-linear value
+        # att_norm_cfg=dict(type='BN'),  # norm after q,k (design for fp16, also conduct better performance in fp32)
+        x_qk_concat=False, x_v_concat=False,  # SAMix x concat: none
+        mask_loss_mode="L1", mask_loss_margin=0.1,  # L1 loss, 0.1
+        frozen=False),
+    head_one=dict(
+        type='ClsHead',  # default CE
+        loss=dict(type='CrossEntropyLoss', use_soft=False, use_sigmoid=False, loss_weight=1.0),
+        with_avg_pool=True, multi_label=False, in_channels=512, num_classes=5089),
+    head_mix=dict(  # backbone & mixblock
+        type='ClsMixupHead',  # mixup, default CE
+        loss=dict(type='CrossEntropyLoss', use_soft=False, use_sigmoid=False, loss_weight=1.0),
+        with_avg_pool=True, multi_label=False, in_channels=512, num_classes=5089),
+    head_weights=dict(
+        head_mix_q=1, head_one_q=1, head_mix_k=1, head_one_k=1),
+)
+
+# additional scheduler
+addtional_scheduler = dict(
+    policy='CosineAnnealing', min_lr=1e-3,
+    paramwise_options=['mix_block'],
+)
diff --git a/configs/classification/inaturalist2017/automix/basic/r50_l2_a2_near_lam_cat_mb_mlr1e_4_bb_mlr0_4xb64.py b/configs/classification/inaturalist2017/automix/basic/r50_l2_a2_near_lam_cat_mb_mlr1e_4_bb_mlr0_4xb64_ep100.py
similarity index 97%
rename from configs/classification/inaturalist2017/automix/basic/r50_l2_a2_near_lam_cat_mb_mlr1e_4_bb_mlr0_4xb64.py
rename to configs/classification/inaturalist2017/automix/basic/r50_l2_a2_near_lam_cat_mb_mlr1e_4_bb_mlr0_4xb64_ep100.py
index b958126d..d11b9d77 100644
--- a/configs/classification/inaturalist2017/automix/basic/r50_l2_a2_near_lam_cat_mb_mlr1e_4_bb_mlr0_4xb64.py
+++ b/configs/classification/inaturalist2017/automix/basic/r50_l2_a2_near_lam_cat_mb_mlr1e_4_bb_mlr0_4xb64_ep100.py
@@ -31,7 +31,6 @@
         x_qk_concat=False, x_v_concat=False,  # SAMix x concat: none
         # att_norm_cfg=dict(type='BN'),  # norm after q,k (design for fp16, also conduct better performace in fp32)
         mask_loss_mode="L1", mask_loss_margin=0.1,  # L1 loss, 0.1
-        mask_mode="none_v_",
         frozen=False),
     head_one=dict(
         type='ClsHead',  # default CE
@@ -48,7 +47,7 @@
 # additional hooks
 custom_hooks = [
     dict(type='SAVEHook',
-
save_interval=22630, # plot every 2263 x 10ep + save_interval=2263 * 10, # plot every 2263 x 10ep iter_per_epoch=2263, ), dict(type='CustomCosineAnnealingHook', # 0.1 to 0 diff --git a/configs/classification/inaturalist2017/automix/basic/rx101_l2_a2_near_lam_cat_mb_mlr1e_4_bb_mlr0_4xb64.py b/configs/classification/inaturalist2017/automix/basic/rx101_l2_a2_near_lam_cat_mb_mlr1e_4_bb_mlr0_4xb64_ep100.py similarity index 79% rename from configs/classification/inaturalist2017/automix/basic/rx101_l2_a2_near_lam_cat_mb_mlr1e_4_bb_mlr0_4xb64.py rename to configs/classification/inaturalist2017/automix/basic/rx101_l2_a2_near_lam_cat_mb_mlr1e_4_bb_mlr0_4xb64_ep100.py index 26c169f4..a473f5db 100644 --- a/configs/classification/inaturalist2017/automix/basic/rx101_l2_a2_near_lam_cat_mb_mlr1e_4_bb_mlr0_4xb64.py +++ b/configs/classification/inaturalist2017/automix/basic/rx101_l2_a2_near_lam_cat_mb_mlr1e_4_bb_mlr0_4xb64_ep100.py @@ -1,4 +1,4 @@ -_base_ = "r50_l2_a2_near_lam_cat_mb_mlr1e_4_bb_mlr0_4xb64.py" +_base_ = "r50_l2_a2_near_lam_cat_mb_mlr1e_4_bb_mlr0_4xb64_ep100.py" # model settings model = dict( diff --git a/configs/classification/inaturalist2017/samix/basic/r18_l2_a2_bili_val_dp01_mb_mlr1e_3_bb_mlr0_4xb64_ep100.py b/configs/classification/inaturalist2017/samix/basic/r18_l2_a2_bili_val_dp01_mb_mlr1e_3_bb_mlr0_4xb64_ep100.py new file mode 100644 index 00000000..dd9ec414 --- /dev/null +++ b/configs/classification/inaturalist2017/samix/basic/r18_l2_a2_bili_val_dp01_mb_mlr1e_3_bb_mlr0_4xb64_ep100.py @@ -0,0 +1,62 @@ +_base_ = "r50_l2_a2_bili_val_dp01_mb_mlr1e_4_bb_mlr0_4xb64_ep100.py" + +# value_neck_cfg +conv1x1=dict( + type="ConvNeck", + in_channels=256, hid_channels=128, out_channels=1, # MixBlock v + num_layers=2, kernel_size=1, + with_last_norm=False, norm_cfg=dict(type='BN'), # default + with_last_dropout=0.1, with_avg_pool=False, with_residual=False) # no res + dropout + +# model settings +model = dict( + type='AutoMixup', + pretrained=None, + alpha=2.0, + momentum=0.999, # 0.999 to 0.99999 + mask_layer=2, + mask_loss=0.1, # using mask loss + mask_adjust=0, + lam_margin=0.08, # degenerate to mixup when lam or 1-lam <= 0.08 + mask_up_override=None, # If not none, override upsampling when train MixBlock + debug=False, # show attention and content map + backbone=dict( + type='ResNet', + depth=18, + num_stages=4, + out_indices=(2,3), # stage-3 for MixBlock, x-1: stage-x + style='pytorch'), + mix_block = dict( # SAMix + type='PixelMixBlock', + in_channels=256, reduction=2, use_scale=True, + unsampling_mode=['bilinear',], # str or list, tricks in SAMix + lam_concat=False, lam_concat_v=False, # AutoMix.V1: none + lam_mul=True, lam_residual=True, lam_mul_k=-1, # SAMix lam: mult + k=-1 (-1 for large datasets) + value_neck_cfg=conv1x1, # SAMix: non-linear value + x_qk_concat=True, x_v_concat=False, # SAMix x concat: q,k + # att_norm_cfg=dict(type='BN'), # norm after q,k (design for fp16, also conduct better performace in fp32) + mask_loss_mode="L1+Variance", mask_loss_margin=0.1, # L1+Var loss, tricks in SAMix + frozen=False), + head_one=dict( + type='ClsHead', # default CE + loss=dict(type='CrossEntropyLoss', use_soft=False, use_sigmoid=False, loss_weight=1.0), + with_avg_pool=True, multi_label=False, in_channels=512, num_classes=5089), + head_mix=dict( # backbone + type='ClsMixupHead', # mixup, default CE + loss=dict(type='CrossEntropyLoss', use_soft=False, use_sigmoid=False, loss_weight=1.0), + with_avg_pool=True, multi_label=False, in_channels=512, num_classes=5089), + head_mix_k=dict( # mixblock + 
type='ClsMixupHead', # mixup, soft CE (onehot encoding) + loss=dict(type='CrossEntropyLoss', use_soft=True, use_sigmoid=False, loss_weight=1.0), + with_avg_pool=True, multi_label=True, + neg_weight=1, # try neg (eta in SAMix) + in_channels=512, num_classes=5089), + head_weights=dict( + head_mix_q=1, head_one_q=1, head_mix_k=1, head_one_k=1), +) + +# additional scheduler +addtional_scheduler = dict( + policy='CosineAnnealing', min_lr=1e-3, + paramwise_options=['mix_block'], +) diff --git a/configs/classification/inaturalist2017/samix/basic/r50_l2_a2_bili_val_dp01_mb_mlr1e_4_bb_mlr0_4xb64.py b/configs/classification/inaturalist2017/samix/basic/r50_l2_a2_bili_val_dp01_mb_mlr1e_4_bb_mlr0_4xb64_ep100.py similarity index 97% rename from configs/classification/inaturalist2017/samix/basic/r50_l2_a2_bili_val_dp01_mb_mlr1e_4_bb_mlr0_4xb64.py rename to configs/classification/inaturalist2017/samix/basic/r50_l2_a2_bili_val_dp01_mb_mlr1e_4_bb_mlr0_4xb64_ep100.py index 34fb34f1..9744f92a 100644 --- a/configs/classification/inaturalist2017/samix/basic/r50_l2_a2_bili_val_dp01_mb_mlr1e_4_bb_mlr0_4xb64.py +++ b/configs/classification/inaturalist2017/samix/basic/r50_l2_a2_bili_val_dp01_mb_mlr1e_4_bb_mlr0_4xb64_ep100.py @@ -39,7 +39,6 @@ x_qk_concat=True, x_v_concat=False, # SAMix x concat: q,k # att_norm_cfg=dict(type='BN'), # norm after q,k (design for fp16, also conduct better performace in fp32) mask_loss_mode="L1+Variance", mask_loss_margin=0.1, # L1+Var loss, tricks in SAMix - mask_mode="none_v_", frozen=False), head_one=dict( type='ClsHead', # default CE @@ -62,7 +61,7 @@ # additional hooks custom_hooks = [ dict(type='SAVEHook', - save_interval=22630, # plot every 2263 x 10ep + save_interval=2263 * 10, # plot every 2263 x 10ep iter_per_epoch=2263, ), dict(type='CustomCosineAnnealingHook', # 0.1 to 0 diff --git a/configs/classification/inaturalist2017/samix/basic/rx101_l2_a2_bili_val_dp01_mb_mlr1e_4_bb_mlr0_4xb64.py b/configs/classification/inaturalist2017/samix/basic/rx101_l2_a2_bili_val_dp01_mb_mlr1e_4_bb_mlr0_4xb64.py index 2d925d46..eb054be0 100644 --- a/configs/classification/inaturalist2017/samix/basic/rx101_l2_a2_bili_val_dp01_mb_mlr1e_4_bb_mlr0_4xb64.py +++ b/configs/classification/inaturalist2017/samix/basic/rx101_l2_a2_bili_val_dp01_mb_mlr1e_4_bb_mlr0_4xb64.py @@ -1,4 +1,4 @@ -_base_ = "r50_l2_a2_bili_val_dp01_mb_mlr1e_4_bb_mlr0_4xb64.py" +_base_ = "r50_l2_a2_bili_val_dp01_mb_mlr1e_4_bb_mlr0_4xb64_ep100.py" # model settings model = dict( diff --git a/configs/classification/inaturalist2018/README.md b/configs/classification/inaturalist2018/README.md new file mode 100644 index 00000000..e22ce614 --- /dev/null +++ b/configs/classification/inaturalist2018/README.md @@ -0,0 +1,61 @@ +# Mixup Classification Benchmark on iNaturalist-2018 + +> [The iNaturalist Species Classification and Detection Dataset](https://arxiv.org/abs/1707.06642) + +## Abstract + +Existing image classification datasets used in computer vision tend to have an even number of images for each object category. In contrast, the natural world is heavily imbalanced, as some species are more abundant and easier to photograph than others. To encourage further progress in challenging real world conditions we present the iNaturalist Challenge 2017 dataset - an image classification benchmark consisting of 675,000 images with over 5,000 different species of plants and animals. It features many visually similar species, captured in a wide variety of situations, from all over the world. 
Images were collected with different camera types, have varying image quality, have been verified by multiple citizen scientists, and feature a large class imbalance. We discuss the collection of the dataset and present baseline results for state-of-the-art computer vision classification models. Results show that current non-ensemble based methods achieve only 64% top one classification accuracy, illustrating the difficulty of the dataset. Finally, we report results from a competition that was held with the data. + +
+
+## Results and models
+
+We provide a collection of [weights and logs](https://github.com/Westlake-AI/openmixup/releases/tag/mixup-inat2018-weights) for mixup classification benchmark on iNaturalist-2018. You can download all results from **Baidu Cloud**: [iNaturalist-2018 (wy2v)](https://pan.baidu.com/s/1P4VeJalFLV0chryjYCfveg).
+
+* All compared methods adopt ResNet-50 and ResNeXt-101 (32x4d) architectures and are trained for 100 epochs using the PyTorch training recipe. The training and testing image size is 224 with a CenterCrop ratio of 0.85. We search $\alpha$ in $Beta(\alpha, \alpha)$ for all compared methods.
+* The **median** of top-1 accuracy in the last 5 training epochs is reported for ResNet variants (see the sketch after this README).
+* Visualizations of mixed samples from [AutoMix](https://arxiv.org/abs/2103.13027) and [SAMix](https://arxiv.org/abs/2111.15454) are provided in zip files.
+
+### iNaturalist-2018
+
+| Backbones                                                    | ResNet-50 top-1 | ResNeXt-101 top-1 |
+|--------------------------------------------------------------|:---------------:|:-----------------:|
+| Vanilla                                                      | 62.53           | 66.94             |
+| MixUp [[ICLR'2018](https://arxiv.org/abs/1710.09412)]        | 62.69           | 67.56             |
+| CutMix [[ICCV'2019](https://arxiv.org/abs/1905.04899)]       | 63.91           | 69.75             |
+| ManifoldMix [[ICML'2019](https://arxiv.org/abs/1806.05236)]  | 63.46           | 69.30             |
+| SaliencyMix [[ICLR'2021](https://arxiv.org/abs/2006.01791)]  | 64.27           | 70.01             |
+| FMix [[arXiv'2020](https://arxiv.org/abs/2002.12047)]        | 63.71           | 69.46             |
+| PuzzleMix [[ICML'2020](https://arxiv.org/abs/2009.06962)]    | 64.36           | 70.12             |
+| ResizeMix [[arXiv'2020](https://arxiv.org/abs/2012.11101)]   | 64.12           | 69.30             |
+| AutoMix [[ECCV'2022](https://arxiv.org/abs/2103.13027)]      | 64.73           | 70.49             |
+| SAMix [[arXiv'2021](https://arxiv.org/abs/2111.15454)]       | 64.84           | 70.54             |
+
+We summarize mixup benchmarks in [Model Zoo](https://github.com/Westlake-AI/openmixup/tree/main/docs/en/model_zoos/Model_Zoo_sup.md).
+
+
+## Citation
+
+Please refer to the original paper of iNaturalist datasets and AutoMix for details.
+
+```bibtex
+@article{Horn2018TheIS,
+  title={The iNaturalist Species Classification and Detection Dataset},
+  author={Grant Van Horn and Oisin Mac Aodha and Yang Song and Yin Cui and Chen Sun and Alexander Shepard and Hartwig Adam and Pietro Perona and Serge J. Belongie},
+  journal={2018 IEEE/CVF Conference on Computer Vision and Pattern Recognition},
+  year={2018},
+  pages={8769-8778}
+}
+```
+```bibtex
+@misc{eccv2022automix,
+      title={AutoMix: Unveiling the Power of Mixup for Stronger Classifiers},
+      author={Zicheng Liu and Siyuan Li and Di Wu and Zhiyuan Chen and Lirong Wu and Jianzhu Guo and Stan Z. Li},
+      year={2021},
+      eprint={2103.13027},
+      archivePrefix={arXiv},
+      primaryClass={cs.CV}
+}
+```
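The reporting protocol in the bullets above (the median of top-1 accuracy over the last 5 training epochs) is easy to pin down in code. A self-contained sketch, assuming the per-epoch top-1 values are already collected in a plain list (no particular OpenMixup log format is implied):

```python
# Editorial sketch (not part of the patch): the reported benchmark number
# is the median of the last 5 epochs' top-1 accuracy.
from statistics import median

def reported_top1(top1_per_epoch, last_k=5):
    return median(top1_per_epoch[-last_k:])

# hypothetical run: the last five values are 64.61, 64.85, 64.73, 64.79, 64.70
print(reported_top1([64.20, 64.61, 64.85, 64.73, 64.79, 64.70]))  # -> 64.73
```

Reporting the median rather than the final checkpoint damps epoch-to-epoch fluctuation at the end of cosine-annealed training.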
diff --git a/configs/classification/inaturalist2018/automix/basic/r50_l2_a2_near_lam_cat_mb_mlr1e_4_bb_mlr0_4xb64_ep100.py b/configs/classification/inaturalist2018/automix/basic/r50_l2_a2_near_lam_cat_mb_mlr1e_4_bb_mlr0_4xb64_ep100.py
new file mode 100644
index 00000000..c7a3cb2d
--- /dev/null
+++ b/configs/classification/inaturalist2018/automix/basic/r50_l2_a2_near_lam_cat_mb_mlr1e_4_bb_mlr0_4xb64_ep100.py
@@ -0,0 +1,80 @@
+_base_ = [
+    '../../../_base_/datasets/inaturalist2018/basic_sz224_4xbs64.py',
+    '../../../_base_/default_runtime.py',
+]
+
+# model settings
+model = dict(
+    type='AutoMixup',
+    pretrained=None,
+    alpha=2.0,
+    momentum=0.999,  # 0.999 to 0.99999
+    mask_layer=2,
+    mask_loss=0.1,  # using mask loss
+    mask_adjust=0,
+    lam_margin=0.08,  # degenerate to mixup when lam or 1-lam <= 0.08
+    mask_up_override=None,  # If not none, override upsampling when train MixBlock
+    debug=False,  # show attention and content map
+    backbone=dict(
+        type='ResNet',
+        depth=50,
+        num_stages=4,
+        out_indices=(2,3),  # stage-3 for MixBlock, x-1: stage-x
+        style='pytorch'),
+    mix_block = dict(  # AutoMix
+        type='PixelMixBlock',
+        in_channels=1024, reduction=2, use_scale=True,
+        unsampling_mode=['nearest',],  # str or list, train & test MixBlock
+        lam_concat=True, lam_concat_v=False,  # AutoMix: lam cat q,k,v
+        lam_mul=False, lam_residual=False, lam_mul_k=-1,  # SAMix lam: none
+        value_neck_cfg=None,  # SAMix: non-linear value
+        x_qk_concat=False, x_v_concat=False,  # SAMix x concat: none
+        # att_norm_cfg=dict(type='BN'),  # norm after q,k (design for fp16, also conduct better performance in fp32)
+        mask_loss_mode="L1", mask_loss_margin=0.1,  # L1 loss, 0.1
+        frozen=False),
+    head_one=dict(
+        type='ClsHead',  # default CE
+        loss=dict(type='CrossEntropyLoss', use_soft=False, use_sigmoid=False, loss_weight=1.0),
+        with_avg_pool=True, multi_label=False, in_channels=2048, num_classes=8142),
+    head_mix=dict(  # backbone & mixblock
+        type='ClsMixupHead',  # mixup, default CE
+        loss=dict(type='CrossEntropyLoss', use_soft=False, use_sigmoid=False, loss_weight=1.0),
+        with_avg_pool=True, multi_label=False, in_channels=2048, num_classes=8142),
+    head_weights=dict(
+        head_mix_q=1, head_one_q=1, head_mix_k=1, head_one_k=1),
+)
+
+# additional hooks
+custom_hooks = [
+    dict(type='SAVEHook',
+        save_interval=1710 * 10,  # plot every 10ep
+        iter_per_epoch=1710,
+    ),
+    dict(type='CustomCosineAnnealingHook',  # 0.1 to 0
+        attr_name="mask_loss", attr_base=0.1, by_epoch=False,  # by iter
+        min_attr=0.,
+    ),
+    dict(type='CosineScheduleHook',
+        end_momentum=0.99999,
+        adjust_scope=[0.1, 1.0],
+        warming_up="constant",
+        interval=1)
+]
+# optimizer
+optimizer = dict(type='SGD', lr=0.1, momentum=0.9, weight_decay=0.0001,
+                paramwise_options={
+                    'mix_block': dict(lr=0.1, momentum=0.9)},)  # required paramwise_options
+# optimizer args
+optimizer_config = dict(update_interval=1, grad_clip=None)
+
+# learning policy
+lr_config = dict(policy='CosineAnnealing', min_lr=0.)
+
+# additional scheduler
+addtional_scheduler = dict(
+    policy='CosineAnnealing', min_lr=1e-4,
+    paramwise_options=['mix_block'],
+)
+
+# runtime settings
+runner = dict(type='EpochBasedRunner', max_epochs=100)
diff --git a/configs/classification/inaturalist2018/automix/basic/rx101_l2_a2_near_lam_cat_mb_mlr1e_4_bb_mlr0_4xb64_ep100.py b/configs/classification/inaturalist2018/automix/basic/rx101_l2_a2_near_lam_cat_mb_mlr1e_4_bb_mlr0_4xb64_ep100.py
new file mode 100644
index 00000000..a473f5db
--- /dev/null
+++ b/configs/classification/inaturalist2018/automix/basic/rx101_l2_a2_near_lam_cat_mb_mlr1e_4_bb_mlr0_4xb64_ep100.py
@@ -0,0 +1,12 @@
+_base_ = "r50_l2_a2_near_lam_cat_mb_mlr1e_4_bb_mlr0_4xb64_ep100.py"
+
+# model settings
+model = dict(
+    backbone=dict(
+        type='ResNeXt',
+        depth=101,
+        num_stages=4,
+        groups=32, width_per_group=4,  # 32x4d
+        out_indices=(2,3),  # stage-3 for MixBlock, x-1: stage-x
+        style='pytorch'),
+)
diff --git a/configs/classification/inaturalist2018/mixups/basic/r18_mixups_CE_none_4xb64.py b/configs/classification/inaturalist2018/mixups/basic/r18_mixups_CE_none_4xb64.py
new file mode 100644
index 00000000..5e90c378
--- /dev/null
+++ b/configs/classification/inaturalist2018/mixups/basic/r18_mixups_CE_none_4xb64.py
@@ -0,0 +1,16 @@
+_base_ = "r50_mixups_CE_none_4xb64.py"
+
+# model settings
+model = dict(
+    backbone=dict(
+        type='ResNet',  # normal
+        # type='ResNet_Mix',  # required by 'manifoldmix'
+        depth=18,
+        num_stages=4,
+        out_indices=(3,),  # no conv-1, x-1: stage-x
+        style='pytorch'),
+    head=dict(
+        type='ClsHead',  # mixup head, normal CE loss
+        loss=dict(type='CrossEntropyLoss', loss_weight=1.0),
+        with_avg_pool=True, multi_label=False, in_channels=512, num_classes=8142)
+)
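The `r18_mixups_CE_none_4xb64.py` file above and the `rx101` variant further below override only the backbone and head; the mixup policy itself lives in the `r50_mixups_CE_none_4xb64.py` base that follows, where `mix_mode` is documented as accepting a string or a list. Switching methods is therefore a small child config. A hypothetical example (this file is an illustration, not one added by the patch):

```python
# Hypothetical child config: inherit the r50 base and select CutMix instead of MixUp.
_base_ = "r50_mixups_CE_none_4xb64.py"

# model settings
model = dict(
    alpha=1.0,           # lam ~ Beta(alpha, alpha); "float or list" per the base config
    mix_mode="cutmix",   # "str or list, choose a mixup mode" per the base config
)
```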
diff --git a/configs/classification/inaturalist2018/mixups/basic/r50_mixups_CE_none_4xb64.py b/configs/classification/inaturalist2018/mixups/basic/r50_mixups_CE_none_4xb64.py
new file mode 100644
index 00000000..93f1b518
--- /dev/null
+++ b/configs/classification/inaturalist2018/mixups/basic/r50_mixups_CE_none_4xb64.py
@@ -0,0 +1,43 @@
+_base_ = [
+    '../../../_base_/datasets/inaturalist2018/basic_sz224_4xbs64.py',
+    '../../../_base_/default_runtime.py',
+]
+
+# model settings
+model = dict(
+    type='MixUpClassification',
+    pretrained=None,
+    alpha=1,  # float or list
+    mix_mode="mixup",  # str or list, choose a mixup mode
+    mix_args=dict(
+        attentivemix=dict(grid_size=32, top_k=None, beta=8),  # AttentiveMix+ in this repo (use pre-trained)
+        automix=dict(mask_adjust=0, lam_margin=0),  # require pre-trained mixblock
+        fmix=dict(decay_power=3, size=(224,224), max_soft=0., reformulate=False),
+        manifoldmix=dict(layer=(0, 3)),
+        puzzlemix=dict(transport=True, t_batch_size=32, t_size=-1,  # adjust t_batch_size if CUDA out of memory
+            mp=None, block_num=4,  # block_num<=4 and mp=2/4 for fast training
+            beta=1.2, gamma=0.5, eta=0.2, neigh_size=4, n_labels=3, t_eps=0.8),
+        resizemix=dict(scope=(0.1, 0.8), use_alpha=True),
+        samix=dict(mask_adjust=0, lam_margin=0.08),  # require pre-trained mixblock
+    ),
+    backbone=dict(
+        # type='ResNet',  # normal
+        type='ResNet_Mix',  # required by 'manifoldmix'
+        depth=50,
+        num_stages=4,
+        out_indices=(3,),  # no conv-1, x-1: stage-x
+        style='pytorch'),
+    head=dict(
+        type='ClsHead',  # mixup head, normal CE loss
+        loss=dict(type='CrossEntropyLoss', loss_weight=1.0),
+        with_avg_pool=True, multi_label=False, in_channels=2048, num_classes=8142)
+)
+
+# optimizer
+optimizer = dict(type='SGD', lr=0.1, momentum=0.9, weight_decay=0.0001)
+
+# lr scheduler
+lr_config = dict(policy='CosineAnnealing', min_lr=0)
+
+# runtime settings
+runner = dict(type='EpochBasedRunner', max_epochs=100)
diff --git a/configs/classification/inaturalist2018/mixups/basic/rx101_mixups_CE_none_4xb64.py b/configs/classification/inaturalist2018/mixups/basic/rx101_mixups_CE_none_4xb64.py
new file mode 100644
index 00000000..94e326fd
--- /dev/null
+++ b/configs/classification/inaturalist2018/mixups/basic/rx101_mixups_CE_none_4xb64.py
@@ -0,0 +1,16 @@
+_base_ = "r50_mixups_CE_none_4xb64.py"
+
+# model settings
+model = dict(
+    backbone=dict(
+        # type='ResNeXt',  # normal
+        type='ResNeXt_Mix',  # required by 'manifoldmix'
+        depth=101,
+        groups=32, width_per_group=4,  # 32x4d
+        out_indices=(3,),  # no conv-1, x-1: stage-x
+        style='pytorch'),
+    head=dict(
+        type='ClsHead',  # mixup head, normal CE loss
+        loss=dict(type='CrossEntropyLoss', loss_weight=1.0),
+        with_avg_pool=True, multi_label=False, in_channels=2048, num_classes=8142)
+)
diff --git a/configs/classification/inaturalist2018/samix/basic/r50_l2_a2_bili_val_dp01_mb_mlr1e_4_bb_mlr0_4xb64_ep100.py b/configs/classification/inaturalist2018/samix/basic/r50_l2_a2_bili_val_dp01_mb_mlr1e_4_bb_mlr0_4xb64_ep100.py
new file mode 100644
index 00000000..a5709d9f
--- /dev/null
+++ b/configs/classification/inaturalist2018/samix/basic/r50_l2_a2_bili_val_dp01_mb_mlr1e_4_bb_mlr0_4xb64_ep100.py
@@ -0,0 +1,94 @@
+_base_ = [
+    '../../../_base_/datasets/inaturalist2018/basic_sz224_4xbs64.py',
+    '../../../_base_/default_runtime.py',
+]
+
+# value_neck_cfg
+conv1x1=dict(
+    type="ConvNeck",
+    in_channels=1024, hid_channels=512, out_channels=1,  # MixBlock v
+    num_layers=2, kernel_size=1,
+    with_last_norm=False, norm_cfg=dict(type='BN'),  # default
+    with_last_dropout=0.1, with_avg_pool=False, with_residual=False)  # no res + dropout
+
+# model settings
+model = dict(
+    type='AutoMixup',
+    pretrained=None,
+    alpha=2.0,
+    momentum=0.999,  # 0.999 to 0.99999
+    mask_layer=2,
+    mask_loss=0.1,  # using mask loss
+    mask_adjust=0,
+    lam_margin=0.08,  # degenerate to mixup when lam or 1-lam <= 0.08
+    mask_up_override=None,  # If not none, override upsampling when train MixBlock
+    debug=False,  # show attention and content map
+    backbone=dict(
+        type='ResNet',
+        depth=50,
+        num_stages=4,
+        out_indices=(2,3),  # stage-3 for MixBlock, x-1: stage-x
+        style='pytorch'),
+    mix_block = dict(  # SAMix
+        type='PixelMixBlock',
+        in_channels=1024, reduction=2, use_scale=True,
+        unsampling_mode=['bilinear',],  # str or list, tricks in SAMix
+        lam_concat=False, lam_concat_v=False,  # AutoMix.V1: none
+        lam_mul=True, lam_residual=True, lam_mul_k=-1,  # SAMix lam: mult + k=-1 (-1 for large datasets)
+        value_neck_cfg=conv1x1,  # SAMix: non-linear value
+        x_qk_concat=True, x_v_concat=False,  # SAMix x concat: q,k
+        # att_norm_cfg=dict(type='BN'),  # norm after q,k (design for fp16, also conduct better performance in fp32)
+        mask_loss_mode="L1+Variance", mask_loss_margin=0.1,  # L1+Var loss, tricks in SAMix
+        frozen=False),
+    head_one=dict(
+        type='ClsHead',  # default CE
+        loss=dict(type='CrossEntropyLoss', use_soft=False, use_sigmoid=False, loss_weight=1.0),
+        with_avg_pool=True, multi_label=False, in_channels=2048, num_classes=8142),
+    head_mix=dict(  # backbone
+        type='ClsMixupHead',  # mixup, default CE
+        loss=dict(type='CrossEntropyLoss', use_soft=False, use_sigmoid=False, loss_weight=1.0),
+        with_avg_pool=True, multi_label=False, in_channels=2048, num_classes=8142),
+    head_mix_k=dict(  # mixblock
+        type='ClsMixupHead',  # mixup, soft CE (onehot encoding)
+        loss=dict(type='CrossEntropyLoss',
use_soft=True, use_sigmoid=False, loss_weight=1.0), + with_avg_pool=True, multi_label=True, + neg_weight=1, # try neg (eta in SAMix) + in_channels=2048, num_classes=8142), + head_weights=dict( + head_mix_q=1, head_one_q=1, head_mix_k=1, head_one_k=1), +) + +# additional hooks +custom_hooks = [ + dict(type='SAVEHook', + save_interval=1710 * 10, # plot every 10ep + iter_per_epoch=1710, + ), + dict(type='CustomCosineAnnealingHook', # 0.1 to 0 + attr_name="mask_loss", attr_base=0.1, by_epoch=False, # by iter + min_attr=0., + ), + dict(type='CosineScheduleHook', + end_momentum=0.99999, + adjust_scope=[0.1, 1.0], + warming_up="constant", + interval=1) +] +# optimizer +optimizer = dict(type='SGD', lr=0.1, momentum=0.9, weight_decay=0.0001, + paramwise_options={ + 'mix_block': dict(lr=0.1, momentum=0.9)},) # required parawise_option +# optimizer args +optimizer_config = dict(update_interval=1, grad_clip=None) + +# learning policy +lr_config = dict(policy='CosineAnnealing', min_lr=0.) + +# additional scheduler +addtional_scheduler = dict( + policy='CosineAnnealing', min_lr=1e-4, + paramwise_options=['mix_block'], +) + +# runtime settings +runner = dict(type='EpochBasedRunner', max_epochs=100) diff --git a/configs/classification/inaturalist2018/samix/basic/rx101_l2_a2_bili_val_dp01_mb_mlr1e_4_bb_mlr0_4xb64_ep100.py b/configs/classification/inaturalist2018/samix/basic/rx101_l2_a2_bili_val_dp01_mb_mlr1e_4_bb_mlr0_4xb64_ep100.py new file mode 100644 index 00000000..eb054be0 --- /dev/null +++ b/configs/classification/inaturalist2018/samix/basic/rx101_l2_a2_bili_val_dp01_mb_mlr1e_4_bb_mlr0_4xb64_ep100.py @@ -0,0 +1,12 @@ +_base_ = "r50_l2_a2_bili_val_dp01_mb_mlr1e_4_bb_mlr0_4xb64_ep100.py" + +# model settings +model = dict( + backbone=dict( + type='ResNeXt', + depth=101, + num_stages=4, + groups=32, width_per_group=4, # 32x4d + out_indices=(2,3), # stage-3 for MixBlock, x-1: stage-x + style='pytorch'), +) diff --git a/configs/classification/place205/README.md b/configs/classification/place205/README.md new file mode 100644 index 00000000..a8181e9c --- /dev/null +++ b/configs/classification/place205/README.md @@ -0,0 +1,61 @@ +# Mixup Classification Benchmark on Place205 + +> [Places: A 10 million Image Database for Scene Recognition](http://places2.csail.mit.edu/PAMI_places.pdf) + +## Abstract + +The rise of multi-million-item dataset initiatives has enabled data-hungry machine learning algorithms to reach near-human semantic classification performance at tasks such as visual object and scene recognition. Here we describe the Places Database, a repository of 10 million scene photographs, labeled with scene semantic categories, comprising a large and diverse list of the types of environments encountered in the world. Using the state-of-the-art Convolutional Neural Networks (CNNs), we provide scene classification CNNs (Places-CNNs) as baselines, that significantly outperform the previous approaches. Visualization of the CNNs trained on Places shows that object detectors emerge as an intermediate representation of scene classification. With its high-coverage and high-diversity of exemplars, the Places Database along with the Places-CNNs offer a novel resource to guide future progress on scene recognition problems. + +
+
+## Results and models
+
+We provide a collection of [weights and logs](https://github.com/Westlake-AI/openmixup/releases/tag/mixup-place205-weights) for mixup classification benchmark on Place205. You can download all results from **Baidu Cloud**: [Place205 (4m94)](https://pan.baidu.com/s/1ciAYxK6SwR13UNScp0W3bQ).
+
+* All compared methods adopt ResNet-18/50 architectures and are trained for 100 epochs using the PyTorch training recipe. The training and testing image size is 224 with a CenterCrop ratio of 0.85. We search $\alpha$ in $Beta(\alpha, \alpha)$ for all compared methods (see the CutMix sketch after this README).
+* The **median** of top-1 accuracy in the last 5 training epochs is reported for ResNet-18/50.
+* Visualizations of mixed samples from [AutoMix](https://arxiv.org/abs/2103.13027) and [SAMix](https://arxiv.org/abs/2111.15454) are provided in zip files.
+
+### Place-205
+
+| Backbones                                                    | ResNet-18 top-1 | ResNet-50 top-1 |
+|--------------------------------------------------------------|:---------------:|:---------------:|
+| Vanilla                                                      | 59.63           | 63.10           |
+| MixUp [[ICLR'2018](https://arxiv.org/abs/1710.09412)]        | 59.33           | 63.01           |
+| CutMix [[ICCV'2019](https://arxiv.org/abs/1905.04899)]       | 59.21           | 63.75           |
+| ManifoldMix [[ICML'2019](https://arxiv.org/abs/1806.05236)]  | 59.46           | 63.23           |
+| SaliencyMix [[ICLR'2021](https://arxiv.org/abs/2006.01791)]  | 59.50           | 63.33           |
+| FMix [[arXiv'2020](https://arxiv.org/abs/2002.12047)]        | 59.51           | 63.63           |
+| PuzzleMix [[ICML'2020](https://arxiv.org/abs/2009.06962)]    | 59.62           | 63.91           |
+| ResizeMix [[arXiv'2020](https://arxiv.org/abs/2012.11101)]   | 59.66           | 63.88           |
+| AutoMix [[ECCV'2022](https://arxiv.org/abs/2103.13027)]      | 59.74           | 64.06           |
+| SAMix [[arXiv'2021](https://arxiv.org/abs/2111.15454)]       | 59.86           | 64.27           |
+
+We summarize mixup benchmarks in [Model Zoo](https://github.com/Westlake-AI/openmixup/tree/main/docs/en/model_zoos/Model_Zoo_sup.md).
+
+
+## Citation
+
+Please refer to the original paper of Place205 and AutoMix for details.
+
+```bibtex
+@article{zhou2017places,
+  title={Places: A 10 million Image Database for Scene Recognition},
+  author={Zhou, Bolei and Lapedriza, Agata and Khosla, Aditya and Oliva, Aude and Torralba, Antonio},
+  journal={IEEE Transactions on Pattern Analysis and Machine Intelligence},
+  year={2017},
+  publisher={IEEE}
+}
+```
+```bibtex
+@misc{eccv2022automix,
+      title={AutoMix: Unveiling the Power of Mixup for Stronger Classifiers},
+      author={Zicheng Liu and Siyuan Li and Di Wu and Zhiyuan Chen and Lirong Wu and Jianzhu Guo and Stan Z. Li},
+      year={2021},
+      eprint={2103.13027},
+      archivePrefix={arXiv},
+      primaryClass={cs.CV}
+}
+```
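For the box-based methods in the table above (CutMix and its variants), the sampled $\lambda$ sets the pasted-box area rather than a pixel-wise blend. A short sketch under the same $Beta(\alpha, \alpha)$ sampling, with illustrative helper names (this is the standard CutMix recipe, not OpenMixup's internal implementation):

```python
# Editorial sketch (not part of the patch): CutMix pastes a random box whose
# area is a (1 - lam) fraction of the image, then corrects lam to the actual area.
import numpy as np
import torch

def rand_bbox(h: int, w: int, lam: float):
    ratio = np.sqrt(1.0 - lam)  # side ratio so that box area == (1 - lam) * h * w
    cut_h, cut_w = int(h * ratio), int(w * ratio)
    cy, cx = np.random.randint(h), np.random.randint(w)
    y1, y2 = np.clip(cy - cut_h // 2, 0, h), np.clip(cy + cut_h // 2, 0, h)
    x1, x2 = np.clip(cx - cut_w // 2, 0, w), np.clip(cx + cut_w // 2, 0, w)
    return int(y1), int(y2), int(x1), int(x2)

def cutmix_batch(x: torch.Tensor, alpha: float = 1.0):
    lam = float(np.random.beta(alpha, alpha))
    index = torch.randperm(x.size(0))
    y1, y2, x1, x2 = rand_bbox(x.size(2), x.size(3), lam)
    x = x.clone()
    x[:, :, y1:y2, x1:x2] = x[index, :, y1:y2, x1:x2]  # paste the partner's box
    lam = 1.0 - ((y2 - y1) * (x2 - x1)) / float(x.size(2) * x.size(3))
    return x, index, lam
```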
Li},
+      year={2021},
+      eprint={2103.13027},
+      archivePrefix={arXiv},
+      primaryClass={cs.CV}
+}
+```
diff --git a/configs/classification/place205/automix/basic/r18_l2_a2_near_lam_cat_mb_mlr1e_3_bb_mlr0_4xb64.py b/configs/classification/place205/automix/basic/r18_l2_a2_near_lam_cat_mb_mlr1e_3_bb_mlr0_4xb64_ep100.py
similarity index 94%
rename from configs/classification/place205/automix/basic/r18_l2_a2_near_lam_cat_mb_mlr1e_3_bb_mlr0_4xb64.py
rename to configs/classification/place205/automix/basic/r18_l2_a2_near_lam_cat_mb_mlr1e_3_bb_mlr0_4xb64_ep100.py
index b4fb6933..9934b5f0 100644
--- a/configs/classification/place205/automix/basic/r18_l2_a2_near_lam_cat_mb_mlr1e_3_bb_mlr0_4xb64.py
+++ b/configs/classification/place205/automix/basic/r18_l2_a2_near_lam_cat_mb_mlr1e_3_bb_mlr0_4xb64_ep100.py
@@ -1,4 +1,4 @@
-_base_ = "r50_l2_a2_near_lam_cat_mb_mlr1e_3_bb_mlr0_4xb64.py"
+_base_ = "r50_l2_a2_near_lam_cat_mb_mlr1e_3_bb_mlr0_4xb64_ep100.py"
 
 # model settings
 model = dict(
@@ -18,7 +18,6 @@
         x_qk_concat=False, x_v_concat=False,  # SAMix x concat: none
         # att_norm_cfg=dict(type='BN'),  # norm after q,k (design for fp16, also conducts better performance in fp32)
         mask_loss_mode="L1", mask_loss_margin=0.1,  # L1 loss, 0.1
-        mask_mode="none_v_",
         frozen=False),
     head_one=dict(
         type='ClsHead',  # default CE
diff --git a/configs/classification/place205/automix/basic/r50_l2_a2_near_lam_cat_mb_mlr1e_3_bb_mlr0_4xb64.py b/configs/classification/place205/automix/basic/r50_l2_a2_near_lam_cat_mb_mlr1e_3_bb_mlr0_4xb64_ep100.py
similarity index 99%
rename from configs/classification/place205/automix/basic/r50_l2_a2_near_lam_cat_mb_mlr1e_3_bb_mlr0_4xb64.py
rename to configs/classification/place205/automix/basic/r50_l2_a2_near_lam_cat_mb_mlr1e_3_bb_mlr0_4xb64_ep100.py
index b3546cf2..5964a442 100644
--- a/configs/classification/place205/automix/basic/r50_l2_a2_near_lam_cat_mb_mlr1e_3_bb_mlr0_4xb64.py
+++ b/configs/classification/place205/automix/basic/r50_l2_a2_near_lam_cat_mb_mlr1e_3_bb_mlr0_4xb64_ep100.py
@@ -31,7 +31,6 @@
         x_qk_concat=False, x_v_concat=False,  # SAMix x concat: none
         # att_norm_cfg=dict(type='BN'),  # norm after q,k (design for fp16, also conducts better performance in fp32)
         mask_loss_mode="L1", mask_loss_margin=0.1,  # L1 loss, 0.1
-        mask_mode="none_v_",
         frozen=False),
     head_one=dict(
         type='ClsHead',  # default CE
diff --git a/configs/classification/place205/samix/r18_l2_a2_bili_val_dp01_mb_mlr1e_4_bb_mlr0_4xb64_ep100.py b/configs/classification/place205/samix/r18_l2_a2_bili_val_dp01_mb_mlr1e_4_bb_mlr0_4xb64_ep100.py
new file mode 100644
index 00000000..eebd6213
--- /dev/null
+++ b/configs/classification/place205/samix/r18_l2_a2_bili_val_dp01_mb_mlr1e_4_bb_mlr0_4xb64_ep100.py
@@ -0,0 +1,50 @@
+_base_ = "r50_l2_a2_bili_val_dp01_mb_mlr1e_4_bb_mlr0_4xb64_ep100.py"
+
+# value_neck_cfg
+conv1x1=dict(
+    type="ConvNeck",
+    in_channels=256, hid_channels=128, out_channels=1,  # MixBlock v
+    num_layers=2, kernel_size=1,
+    with_last_norm=False, norm_cfg=dict(type='BN'),  # default
+    with_last_dropout=0.1, with_avg_pool=False, with_residual=False)  # no res + dropout
+
+# model settings
+model = dict(
+    type='AutoMixup',
+    pretrained=None,
+    alpha=2.0,
+    momentum=0.999,  # 0.999 to 0.99999
+    mask_layer=2,
+    mask_loss=0.1,  # using mask loss
+    mask_adjust=0,
+    lam_margin=0.08,  # degenerate to mixup when lam or 1-lam <= 0.08
+    mask_up_override=None,  # if not None, override upsampling when training MixBlock
+    debug=False,  # show attention and content map
+    backbone=dict(
+        type='ResNet',
+        depth=18,
+        num_stages=4,
+        out_indices=(2,3),  # stage-3 for MixBlock, x-1: stage-x
+        style='pytorch'),
+    mix_block = dict(  # SAMix
+        type='PixelMixBlock',
+        in_channels=256, reduction=2, use_scale=True,
+        unsampling_mode=['bilinear',],  # str or list, tricks in SAMix
+        lam_concat=False, lam_concat_v=False,  # AutoMix.V1: none
+        lam_mul=True, lam_residual=True, lam_mul_k=-1,  # SAMix lam: mult + k=-1 (-1 for large datasets)
+        value_neck_cfg=conv1x1,  # SAMix: non-linear value
+        x_qk_concat=True, x_v_concat=False,  # SAMix x concat: q,k
+        # att_norm_cfg=dict(type='BN'),  # norm after q,k (design for fp16, also conducts better performance in fp32)
+        mask_loss_mode="L1+Variance", mask_loss_margin=0.1,  # L1+Var loss, tricks in SAMix
+        frozen=False),
+    head_one=dict(
+        type='ClsHead',  # default CE
+        loss=dict(type='CrossEntropyLoss', use_soft=False, use_sigmoid=False, loss_weight=1.0),
+        with_avg_pool=True, multi_label=False, in_channels=512, num_classes=205),
+    head_mix=dict(  # backbone
+        type='ClsMixupHead',  # mixup, default CE
+        loss=dict(type='CrossEntropyLoss', use_soft=False, use_sigmoid=False, loss_weight=1.0),
+        with_avg_pool=True, multi_label=False, in_channels=512, num_classes=205),
+    head_weights=dict(
+        head_mix_q=1, head_one_q=1, head_mix_k=1, head_one_k=1),
+)
diff --git a/configs/classification/place205/samix/r50_l2_a2_bili_val_dp01_mb_mlr1e_4_bb_mlr0_4xb64_ep100.py b/configs/classification/place205/samix/r50_l2_a2_bili_val_dp01_mb_mlr1e_4_bb_mlr0_4xb64_ep100.py
new file mode 100644
index 00000000..5022d3dd
--- /dev/null
+++ b/configs/classification/place205/samix/r50_l2_a2_bili_val_dp01_mb_mlr1e_4_bb_mlr0_4xb64_ep100.py
@@ -0,0 +1,88 @@
+_base_ = [
+    '../../../_base_/datasets/place205/basic_sz224_4xbs64.py',
+    '../../../_base_/default_runtime.py',
+]
+
+# value_neck_cfg
+conv1x1=dict(
+    type="ConvNeck",
+    in_channels=1024, hid_channels=512, out_channels=1,  # MixBlock v
+    num_layers=2, kernel_size=1,
+    with_last_norm=False, norm_cfg=dict(type='BN'),  # default
+    with_last_dropout=0.1, with_avg_pool=False, with_residual=False)  # no res + dropout
+
+# model settings
+model = dict(
+    type='AutoMixup',
+    pretrained=None,
+    alpha=2.0,
+    momentum=0.999,  # 0.999 to 0.99999
+    mask_layer=2,
+    mask_loss=0.1,  # using mask loss
+    mask_adjust=0,
+    lam_margin=0.08,  # degenerate to mixup when lam or 1-lam <= 0.08
+    mask_up_override=None,  # if not None, override upsampling when training MixBlock
+    debug=False,  # show attention and content map
+    backbone=dict(
+        type='ResNet',
+        depth=50,
+        num_stages=4,
+        out_indices=(2,3),  # stage-3 for MixBlock, x-1: stage-x
+        style='pytorch'),
+    mix_block = dict(  # SAMix
+        type='PixelMixBlock',
+        in_channels=1024, reduction=2, use_scale=True,
+        unsampling_mode=['bilinear',],  # str or list, tricks in SAMix
+        lam_concat=False, lam_concat_v=False,  # AutoMix.V1: none
+        lam_mul=True, lam_residual=True, lam_mul_k=-1,  # SAMix lam: mult + k=-1 (-1 for large datasets)
+        value_neck_cfg=conv1x1,  # SAMix: non-linear value
+        x_qk_concat=True, x_v_concat=False,  # SAMix x concat: q,k
+        # att_norm_cfg=dict(type='BN'),  # norm after q,k (design for fp16, also conducts better performance in fp32)
+        mask_loss_mode="L1+Variance", mask_loss_margin=0.1,  # L1+Var loss, tricks in SAMix
+        frozen=False),
+    head_one=dict(
+        type='ClsHead',  # default CE
+        loss=dict(type='CrossEntropyLoss', use_soft=False, use_sigmoid=False, loss_weight=1.0),
+        with_avg_pool=True, multi_label=False, in_channels=2048, num_classes=205),
+    head_mix=dict(  # backbone
+        type='ClsMixupHead',  # mixup, default CE
+        loss=dict(type='CrossEntropyLoss', use_soft=False, use_sigmoid=False, loss_weight=1.0),
+        with_avg_pool=True, multi_label=False, in_channels=2048, num_classes=205),
+    head_weights=dict(
+        head_mix_q=1, head_one_q=1, head_mix_k=1, head_one_k=1),
+)
+
+# additional hooks
+custom_hooks = [
+    dict(type='SAVEHook',
+        save_interval=9566 * 10,  # plot every 10 ep
+        iter_per_epoch=9566,
+    ),
+    dict(type='CustomCosineAnnealingHook',  # 0.1 to 0
+        attr_name="mask_loss", attr_base=0.1, by_epoch=False,  # by iter
+        min_attr=0.,
+    ),
+    dict(type='CosineScheduleHook',
+        end_momentum=0.99999,
+        adjust_scope=[0.1, 1.0],
+        warming_up="constant",
+        interval=1)
+]
+# optimizer
+optimizer = dict(type='SGD', lr=0.1, momentum=0.9, weight_decay=0.0001,
+    paramwise_options={
+        'mix_block': dict(lr=0.1, momentum=0.9)},)  # required paramwise_options
+# optimizer args
+optimizer_config = dict(update_interval=1, grad_clip=None)
+
+# learning policy
+lr_config = dict(policy='CosineAnnealing', min_lr=0.)
+
+# additional scheduler
+addtional_scheduler = dict(
+    policy='CosineAnnealing', min_lr=1e-4,
+    paramwise_options=['mix_block'],
+)
+
+# runtime settings
+runner = dict(type='EpochBasedRunner', max_epochs=100)
diff --git a/configs/classification/tiny_imagenet/automix/basic/r18_l2_a2_near_mb_mlr1e_3_bb_mlr5e_2.py b/configs/classification/tiny_imagenet/automix/basic/r18_l2_a2_near_mb_mlr1e_3_bb_mlr5e_2.py
index a0665976..47c9898c 100644
--- a/configs/classification/tiny_imagenet/automix/basic/r18_l2_a2_near_mb_mlr1e_3_bb_mlr5e_2.py
+++ b/configs/classification/tiny_imagenet/automix/basic/r18_l2_a2_near_mb_mlr1e_3_bb_mlr5e_2.py
@@ -31,7 +31,6 @@
         x_qk_concat=False, x_v_concat=False,  # SAMix x concat: none
         # att_norm_cfg=dict(type='BN'),  # norm after q,k (design for fp16, also conducts better performance in fp32)
         mask_loss_mode="L1", mask_loss_margin=0.1,  # L1 loss, 0.1
-        mask_mode="none_v_",
         frozen=False),
     head_one=dict(
         type='ClsHead',  # default CE
diff --git a/configs/classification/tiny_imagenet/automix/basic/rx50_l2_a2_near_mb_mlr1e_3_bb_mlr5e_2_2xb50.py b/configs/classification/tiny_imagenet/automix/basic/rx50_l2_a2_near_mb_mlr1e_3_bb_mlr5e_2_2xb50.py
index 7781fbc4..47a71ccf 100644
--- a/configs/classification/tiny_imagenet/automix/basic/rx50_l2_a2_near_mb_mlr1e_3_bb_mlr5e_2_2xb50.py
+++ b/configs/classification/tiny_imagenet/automix/basic/rx50_l2_a2_near_mb_mlr1e_3_bb_mlr5e_2_2xb50.py
@@ -32,7 +32,6 @@
         x_qk_concat=False, x_v_concat=False,  # SAMix x concat: none
         # att_norm_cfg=dict(type='BN'),  # norm after q,k (design for fp16, also conducts better performance in fp32)
         mask_loss_mode="L1", mask_loss_margin=0.1,  # L1 loss, 0.1
-        mask_mode="none_v_",
         frozen=False),
     head_one=dict(
         type='ClsHead',  # default CE
diff --git a/configs/classification/tiny_imagenet/samix/basic/r18/unsampling_modebilinear/r18_l2_a2_bili_val_dp0_mul_mb_mlr1e_3_bb_mlr5e_2_ep400.py b/configs/classification/tiny_imagenet/samix/basic/r18/unsampling_modebilinear/r18_l2_a2_bili_val_dp0_mul_mb_mlr1e_3_bb_mlr5e_2_ep400.py
index 4ba88094..1d322769 100644
--- a/configs/classification/tiny_imagenet/samix/basic/r18/unsampling_modebilinear/r18_l2_a2_bili_val_dp0_mul_mb_mlr1e_3_bb_mlr5e_2_ep400.py
+++ b/configs/classification/tiny_imagenet/samix/basic/r18/unsampling_modebilinear/r18_l2_a2_bili_val_dp0_mul_mb_mlr1e_3_bb_mlr5e_2_ep400.py
@@ -39,7 +39,6 @@
         x_qk_concat=True, x_v_concat=False,  # SAMix x concat: q,k
         # att_norm_cfg=dict(type='BN'),  # norm after q,k (design for fp16, also conducts better performance in fp32)
         mask_loss_mode="L1+Variance", mask_loss_margin=0.1,  # L1+Var loss, tricks in SAMix
-        mask_mode="none_v_",
mask_mode="none_v_", frozen=False), head_one=dict( type='ClsHead', # soft CE diff --git a/configs/classification/tiny_imagenet/samix/basic/r18_l2_a2_bili_val_dp0_mul_mb_mlr1e_3_bb_mlr5e_2.py b/configs/classification/tiny_imagenet/samix/basic/r18_l2_a2_bili_val_dp0_mul_mb_mlr1e_3_bb_mlr5e_2.py index 0f781dfe..8c7a5417 100644 --- a/configs/classification/tiny_imagenet/samix/basic/r18_l2_a2_bili_val_dp0_mul_mb_mlr1e_3_bb_mlr5e_2.py +++ b/configs/classification/tiny_imagenet/samix/basic/r18_l2_a2_bili_val_dp0_mul_mb_mlr1e_3_bb_mlr5e_2.py @@ -39,7 +39,6 @@ x_qk_concat=True, x_v_concat=False, # SAMix x concat: q,k # att_norm_cfg=dict(type='BN'), # norm after q,k (design for fp16, also conduct better performace in fp32) mask_loss_mode="L1+Variance", mask_loss_margin=0.1, # L1+Var loss, tricks in SAMix - mask_mode="none_v_", frozen=False), head_one=dict( type='ClsHead', # soft CE diff --git a/configs/classification/tiny_imagenet/samix/basic/rx50/unsampling_modebilinear/rx50_l2_a2_bili_val_dp0_mul_mb_mlr1e_3_bb_mlr5e_2_2xb50_lam_mul_k0_25_mask_adjust0_25_ep400.py b/configs/classification/tiny_imagenet/samix/basic/rx50/unsampling_modebilinear/rx50_l2_a2_bili_val_dp0_mul_mb_mlr1e_3_bb_mlr5e_2_2xb50_lam_mul_k0_25_mask_adjust0_25_ep400.py index 416d529f..4bd4a129 100644 --- a/configs/classification/tiny_imagenet/samix/basic/rx50/unsampling_modebilinear/rx50_l2_a2_bili_val_dp0_mul_mb_mlr1e_3_bb_mlr5e_2_2xb50_lam_mul_k0_25_mask_adjust0_25_ep400.py +++ b/configs/classification/tiny_imagenet/samix/basic/rx50/unsampling_modebilinear/rx50_l2_a2_bili_val_dp0_mul_mb_mlr1e_3_bb_mlr5e_2_2xb50_lam_mul_k0_25_mask_adjust0_25_ep400.py @@ -40,7 +40,6 @@ x_qk_concat=True, x_v_concat=False, # SAMix x concat: q,k # att_norm_cfg=dict(type='BN'), # norm after q,k (design for fp16, also conduct better performace in fp32) mask_loss_mode="L1+Variance", mask_loss_margin=0.1, # L1+Var loss, tricks in SAMix - mask_mode="none_v_", frozen=False), head_one=dict( type='ClsHead', # soft CE diff --git a/configs/classification/tiny_imagenet/samix/basic/rx50_l2_a2_bili_val_dp0_mul_mb_mlr1e_3_bb_mlr5e_2_2xb50.py b/configs/classification/tiny_imagenet/samix/basic/rx50_l2_a2_bili_val_dp0_mul_mb_mlr1e_3_bb_mlr5e_2_2xb50.py index 2aea947d..9c94cda0 100644 --- a/configs/classification/tiny_imagenet/samix/basic/rx50_l2_a2_bili_val_dp0_mul_mb_mlr1e_3_bb_mlr5e_2_2xb50.py +++ b/configs/classification/tiny_imagenet/samix/basic/rx50_l2_a2_bili_val_dp0_mul_mb_mlr1e_3_bb_mlr5e_2_2xb50.py @@ -40,7 +40,6 @@ x_qk_concat=True, x_v_concat=False, # SAMix x concat: q,k # att_norm_cfg=dict(type='BN'), # norm after q,k (design for fp16, also conduct better performace in fp32) mask_loss_mode="L1+Variance", mask_loss_margin=0.1, # L1+Var loss, tricks in SAMix - mask_mode="none_v_", frozen=False), head_one=dict( type='ClsHead', # soft CE diff --git a/docs/en/awesome_mixups/Mixup_SL.md b/docs/en/awesome_mixups/Mixup_SL.md index dcc759e0..4164c163 100644 --- a/docs/en/awesome_mixups/Mixup_SL.md +++ b/docs/en/awesome_mixups/Mixup_SL.md @@ -41,8 +41,16 @@ We are working on a survey of mixup methods. The list of awesome mixup methods i - Domain Generalization with MixStyle. [[ICLR'2021](https://openreview.net/forum?id=6xHJ37MVxxp)] [[code](https://github.com/KaiyangZhou/mixstyle-release)] * **MoEx**: Boyi Li, Felix Wu, Ser-Nam Lim, Serge Belongie, Kilian Q. Weinberger. - On Feature Normalization and Data Augmentation. 
+* **k-Mixup**: Kristjan Greenewald, Anming Gu, Mikhail Yurochkin, Justin Solomon, Edward Chien.
+  - k-Mixup Regularization for Deep Learning via Optimal Transport. [[ArXiv'2021](https://arxiv.org/abs/2106.02933)]
 * **LocalMix**: Raphael Baena, Lucas Drumetz, Vincent Gripon.
-  - Preventing Manifold Intrusion with Locality: Local Mixup. [[AISTATS'2021](https://arxiv.org/abs/2201.04368)]
+  - Preventing Manifold Intrusion with Locality: Local Mixup. [[ArXiv'2022](https://arxiv.org/abs/2201.04368)] [[code](https://github.com/raphael-baena/Local-Mixup)]
+* **RandomMix**: Xiaoliang Liu, Furao Shen, Jian Zhao, Changhai Nie.
+  - RandomMix: A mixed sample data augmentation method with multiple mixed modes. [[ArXiv'2022](https://arxiv.org/abs/2205.08728)]
+* **SuperpixelGridCut**: Karim Hammoudi, Adnane Cabani, Bouthaina Slika, Halim Benhabiles, Fadi Dornaika, Mahmoud Melkemi.
+  - SuperpixelGridCut, SuperpixelGridMean and SuperpixelGridMix Data Augmentation. [[ArXiv'2022](https://arxiv.org/abs/2204.08458)] [[code](https://github.com/hammoudiproject/SuperpixelGridMasks)]
+* **AugRmixAT**: Xiaoliang Liu, Furao Shen, Jian Zhao, Changhai Nie.
+  - AugRmixAT: A Data Processing and Training Method for Improving Multiple Robustness and Generalization Performance. [[ICME'2022](https://arxiv.org/abs/2207.10290)]
 
 ### Saliency-guided Policies
 
@@ -93,6 +101,8 @@
   - Saliency Grafting: Innocuous Attribution-Guided Mixup with Calibrated Label Mixing. [[AAAI'2022](https://arxiv.org/abs/2112.08796)]
 * **TransMix**: Jie-Neng Chen, Shuyang Sun, Ju He, Philip Torr, Alan Yuille, Song Bai.
   - TransMix: Attend to Mix for Vision Transformers. [[CVPR'2022](https://arxiv.org/abs/2111.09833)] [[code](https://github.com/Beckschen/TransMix)]
+* **GenLabel**: Yeming Wen, Ghassen Jerfel, Rafael Muller, Michael W. Dusenberry, Jasper Snoek, Balaji Lakshminarayanan, Dustin Tran.
+  - GenLabel: Mixup Relabeling using Generative Models. [[ArXiv'2022](https://arxiv.org/abs/2201.02354)]
 * **DecoupleMix**: Zicheng Liu, Siyuan Li, Ge Wang, Cheng Tan, Lirong Wu, Stan Z. Li.
   - Decoupled Mixup for Data-efficient Learning. [[ArXiv'2022](https://arxiv.org/abs/2203.10761)] [[code](https://github.com/Westlake-AI/openmixup)]
 * **TokenMix**: Jihao Liu, Boxiao Liu, Hang Zhou, Hongsheng Li, Yu Liu.
diff --git a/docs/en/awesome_selfsup/MIM.md b/docs/en/awesome_selfsup/MIM.md
index 88edf298..d0bfe7a5 100644
--- a/docs/en/awesome_selfsup/MIM.md
+++ b/docs/en/awesome_selfsup/MIM.md
@@ -47,6 +47,8 @@
   - Object-wise Masked Autoencoders for Fast Pre-training. [[ArXiv'2022](https://arxiv.org/abs/2205.14338)]
 * **LoMaR**: Jun Chen, Ming Hu, Boyang Li, Mohamed Elhoseiny.
   - Efficient Self-supervised Vision Pretraining with Local Masked Reconstruction. [[ArXiv'2022](https://arxiv.org/abs/2206.00790)] [[code](https://github.com/junchen14/LoMaR)]
+* **BEiT.V2**: Zhiliang Peng, Li Dong, Hangbo Bao, Qixiang Ye, Furu Wei.
+  - BEiT v2: Masked Image Modeling with Vector-Quantized Visual Tokenizers. [[ArXiv'2022](http://arxiv.org/abs/2208.06366)] [[code](https://aka.ms/beit)]
 
 ### MIM with Contrastive Learning
 
@@ -62,6 +64,8 @@
   - Masked Image Modeling with Denoising Contrast.
[[ArXiv'2022](https://arxiv.org/abs/2205.09616)] * **RePre**: Luya Wang, Feng Liang, Yangguang Li, Honggang Zhang, Wanli Ouyang, Jing Shao. - RePre: Improving Self-Supervised Vision Transformer with Reconstructive Pre-training. [[ArXiv'2022](https://arxiv.org/abs/2201.06857)] +* **CMAE**: Zhicheng Huang, Xiaojie Jin, Chengze Lu, Qibin Hou, Ming-Ming Cheng, Dongmei Fu, Xiaohui Shen, Jiashi Feng. + - Contrastive Masked Autoencoders are Stronger Vision Learners. [[ArXiv'2022](https://arxiv.org/abs/2207.13532)] ### MIM for Transformer and CNN @@ -103,8 +107,12 @@ The list of awesome MIM methods is summarized in chronological order and is on u - Masked Autoencoders As Spatiotemporal Learners. [[ArXiv'2022](https://arxiv.org/abs/2205.09113)] * **MaskViT**: Agrim Gupta, Stephen Tian, Yunzhi Zhang, Jiajun Wu, Roberto Martín-Martín, Li Fei-Fei. - MaskViT: Masked Visual Pre-Training for Video Prediction. [[ArXiv'2022](https://arxiv.org/abs/2206.11894)] [[code](https://github.com/agrimgupta92/maskvit)] +* **OmniMAE**: Rohit Girdhar, Alaaeldin El-Nouby, Mannat Singh, Kalyan Vasudev Alwala, Armand Joulin, Ishan Misra. + - OmniMAE: Single Model Masked Pretraining on Images and Videos. [[ArXiv'2022](http://arxiv.org/abs/2206.08356)] [[code](https://github.com/facebookresearch/omnivore)] * **MILES**: Yuying Ge, Yixiao Ge, Xihui Liu, Alex Jinpeng Wang, Jianping Wu, Ying Shan, Xiaohu Qie, Ping Luo. - MILES: Visual BERT Pre-training with Injected Language Semantics for Video-text Retrieval. [[ArXiv'2022](https://arxiv.org/abs/2204.12408)] [[code](https://github.com/tencentarc/mcq)] +* **MAR**: Zhiwu Qing, Shiwei Zhang, Ziyuan Huang, Xiang Wang, Yuehuan Wang, Yiliang Lv, Changxin Gao, Nong Sang. + - MAR: Masked Autoencoders for Efficient Action Recognition. [[ArXiv'2022](http://arxiv.org/abs/2207.11660)] ### Medical Image @@ -119,6 +127,18 @@ The list of awesome MIM methods is summarized in chronological order and is on u - Pre-Training 3D Point Cloud Transformers with Masked Point Modeling. [[CVPR'2022](https://arxiv.org/abs/2111.14819)] [[code](https://github.com/lulutang0608/Point-BERT)] * **PointMAE**: Yatian Pang, Wenxiao Wang, Francis E.H. Tay, Wei Liu, Yonghong Tian, Li Yuan. - Masked Autoencoders for Point Cloud Self-supervised Learning. [[ECCV'2022](https://arxiv.org/abs/2203.06604)] [[code](https://github.com/Pang-Yatian/Point-MAE)] +* **VoxelMAE**: Chen Min, Xinli Xu, Dawei Zhao, Liang Xiao, Yiming Nie, Bin Dai. + - Voxel-MAE: Masked Autoencoders for Pre-training Large-scale Point Clouds. [[ArXiv'2022](https://arxiv.org/abs/2206.09900)] + +### 3D Mesh Data + +* **MeshMAE**: Yaqian Liang, Shanshan Zhao, Baosheng Yu, Jing Zhang, Fazhi He. + - MeshMAE: Masked Autoencoders for 3D Mesh Data Analysis. [[ECCV'2022](http://arxiv.org/abs/2207.10228)] + +### Reinforcement Learning + +* **MLR**: Tao Yu, Zhizheng Zhang, Cuiling Lan, Yan Lu, Zhibo Chen. + - Mask-based Latent Reconstruction for Reinforcement Learning. [[ArXiv'2022](https://arxiv.org/abs/2201.12096)] ## Analysis of MIM @@ -141,6 +161,10 @@ The list of awesome MIM methods is summarized in chronological order and is on u - On Data Scaling in Masked Image Modeling. [[ArXiv'2022](https://arxiv.org/abs/2206.04664)] * Jiachun Pan, Pan Zhou, Shuicheng Yan. - Towards Understanding Why Mask-Reconstruction Pretraining Helps in Downstream Tasks. [[ArXiv'2022](https://arxiv.org/abs/2206.03826)] +* Gokul Karthik Kumar, Sahal Shaji Mullappilly, Abhishek Singh Gehlot. 
+  - An Empirical Study Of Self-supervised Learning Approaches For Object Detection With Transformers. [[ArXiv'2022](https://arxiv.org/abs/2205.05543)] [[code](https://github.com/gokulkarthik/deformable-detr)]
+* Xiangwen Kong, Xiangyu Zhang.
+  - Understanding Masked Image Modeling via Learning Occlusion Invariant Feature. [[ArXiv'2022](http://arxiv.org/abs/2208.04164)]
 
 ## Contribution
 
diff --git a/openmixup/models/backbones/lan.py b/openmixup/models/backbones/lan.py
index cea13c66..bdcc3ad5 100644
--- a/openmixup/models/backbones/lan.py
+++ b/openmixup/models/backbones/lan.py
@@ -168,33 +168,30 @@ def __init__(self,
                  kernel_size=3,
                  act_cfg=dict(type='GELU'),
                  ffn_drop=0.,
-                 decompose_repeat=2,
-                 decompose_method='learn',
-                 decompose_reweight='learn',
-                 decompose_init_value=1e-2,
+                 decompose_method='after',
+                 decompose_init_value=0.,
                  decompose_act_cfg=None,
+                 decompose_post_conv=False,
                  init_cfg=None):
         super(DecomposeFFN, self).__init__(init_cfg=init_cfg)
         self.embed_dims = embed_dims
         self.feedforward_channels = feedforward_channels
-        self.decompose_repeat = decompose_repeat
-        self.decompose_channels = int(feedforward_channels / decompose_repeat)
-        assert self.feedforward_channels % self.decompose_channels == 0
         self.act_cfg = act_cfg
+        assert decompose_post_conv == False
 
         self.fc1 = Conv2d(
             in_channels=embed_dims,
-            out_channels=self.decompose_channels,
+            out_channels=self.feedforward_channels,
             kernel_size=1)
         self.dwconv = Conv2d(
-            in_channels=self.decompose_channels,
-            out_channels=self.decompose_channels,
+            in_channels=self.feedforward_channels,
+            out_channels=self.feedforward_channels,
             kernel_size=kernel_size,
             stride=1,
             padding=kernel_size // 2,
             bias=True,
-            groups=self.decompose_channels)
+            groups=self.feedforward_channels)
         self.act = build_activation_layer(act_cfg)
         self.fc2 = Conv2d(
             in_channels=feedforward_channels,
@@ -202,30 +199,36 @@
             kernel_size=1)
         self.drop = nn.Dropout(ffn_drop)
 
+        assert decompose_method in [None, 'between', 'between-shortcut', 'after',]
+        self.decompose_method = decompose_method
         self.decompose = Conv2d(
-            in_channels=self.decompose_channels,  # C -> 1
+            in_channels=self.feedforward_channels,  # C -> 1
             out_channels=1,
             kernel_size=1,
-        ) if decompose_method == 'learn' else None
-        for i in range(self.decompose_repeat):
-            sigma = ElementScale(
-                self.decompose_channels, decompose_init_value, decompose_reweight=='learn')
-            self.add_module(f'sigma{i + 1}', sigma)
-        self.decompose_act = custom_build_activation_layer(decompose_act_cfg)
+        ) if decompose_method is not None else nn.Identity()
+        self.sigma = ElementScale(
+            self.feedforward_channels, decompose_init_value, requires_grad=True)
+        self.decompose_act = custom_build_activation_layer(decompose_act_cfg) \
+            if decompose_method is not None else nn.Identity()
+
+    def feat_decompose(self, x, shortcut=None):
+        x_d = shortcut if shortcut is not None else x
+        x_d = self.decompose_act(self.decompose(x_d))  # [B, C, H, W] -> [B, 1, H, W]
+        x = x + self.sigma(x - x_d)
+        return x
 
     def forward(self, x):
         # proj 1
-        x = self.act(self.dwconv(self.fc1(x)))
-        x = self.drop(x)
-        # decompose
-        if self.decompose is not None:
-            x_d = self.decompose_act(self.decompose(x))  # [B, C, H, W] -> [B, 1, H, W]
+        x = self.fc1(x)
+        if self.decompose_method == 'between-shortcut':
+            x = self.feat_decompose(self.dwconv(x), shortcut=x)
         else:
-            x_d = torch.mean(x, dim=1, keepdim=True)  # [B, 1, H, W]
-        x_repeat = list()
-        for i in range(self.decompose_repeat):
-            sigma_i = getattr(self, f'sigma{i + 1}')
-            x_repeat.append(x + sigma_i(x - x_d))
- x = torch.cat(x_repeat, dim=1) + x = self.dwconv(x) + if self.decompose_method == 'between': + x = self.feat_decompose(x) + x = self.act(x) + x = self.drop(x) + if self.decompose_method == 'after': + x = self.feat_decompose(x) # proj 2 x = self.fc2(x) x = self.drop(x) @@ -402,6 +405,7 @@ def __init__(self, self.embed_dims_0 = embed_dims - self.embed_dims_1 - self.embed_dims_2 self.embed_dims = embed_dims + assert with_dilation == True and with_pointwise == True assert dw_kernel_size % 2 == 1 and dw_kernel_size >= 3 # basic DW conv self.DW_conv0 = Conv2d( @@ -414,11 +418,11 @@ def __init__(self, self.DW_conv1 = Conv2d( in_channels=self.embed_dims_1, out_channels=self.embed_dims_1, - kernel_size=5, + kernel_size=5 if dw_kernel_size != 7 else 7, stride=1, - padding=4 if with_dilation else 5 // 2, + padding=4 if dw_kernel_size != 7 else 6, groups=self.embed_dims_1, - dilation=2 if with_dilation else 1, + dilation=2, ) # DW conv 2 self.DW_conv2 = Conv2d( @@ -426,15 +430,15 @@ def __init__(self, out_channels=self.embed_dims_2, kernel_size=7, stride=1, - padding=9 if with_dilation else 7 // 2, + padding=9, groups=self.embed_dims_2, - dilation=3 if with_dilation else 1, + dilation=3, ) # a channel convolution self.PW_conv = Conv2d( # point-wise convolution in_channels=embed_dims, out_channels=embed_dims, - kernel_size=1) if with_pointwise else nn.Identity() + kernel_size=1) def forward(self, x): x_0 = self.DW_conv0(x) @@ -465,7 +469,6 @@ def __init__(self, act_gate_kernel=dict(type="SiLU"), with_dilation=True, with_pointwise=True, - with_glu_dw_conv=False, with_channel_shuffle=False, init_cfg=None): super(GAUAttention, self).__init__(init_cfg=init_cfg) @@ -476,12 +479,6 @@ def __init__(self, in_channels=embed_dims, out_channels=embed_dims, kernel_size=1) self.proj_g = Conv2d( in_channels=embed_dims, out_channels=embed_dims, kernel_size=1) - self.conv_g = nn.Conv2d( - in_channels=embed_dims, - out_channels=embed_dims, - kernel_size=dw_kernel_size, - padding=dw_kernel_size // 2, - groups=embed_dims) if with_glu_dw_conv else nn.Identity() # value self.large_kernel_unit = LKGAU( embed_dims, dw_kernel_size, @@ -497,18 +494,13 @@ def __init__(self, def forward(self, x): shorcut = x.clone() - x = self.proj_1(x) - x = self.act_value(x) + x = self.act_value(self.proj_1(x)) - # value + # gating * value v = self.large_kernel_unit(x) - v = self.act_value(v) - # gating - g = self.conv_g(x) - g = self.proj_g(g) - g = self.act_gate(g) + g = self.proj_g(x) + x = self.act_gate(g) * self.act_value(v) - x = g * v if self.with_channel_shuffle: x = channel_shuffle(x) x = self.proj_2(x) @@ -535,25 +527,19 @@ def __init__(self, with_channel_split=[2, 1, 1], with_dilation=True, with_pointwise=True, - with_glu_dw_conv=False, with_channel_shuffle=False, + decompose_method=None, + decompose_position='before', init_cfg=None): super(InceptionGAUAttention, self).__init__(init_cfg=init_cfg) self.embed_dims = embed_dims - self.with_channel_shuffle = with_channel_shuffle self.proj_1 = Conv2d( in_channels=embed_dims, out_channels=embed_dims, kernel_size=1) - self.proj_g = Conv2d( + self.gate = Conv2d( in_channels=embed_dims, out_channels=embed_dims, kernel_size=1) - self.conv_g = nn.Conv2d( - in_channels=embed_dims, - out_channels=embed_dims, - kernel_size=dw_kernel_size, - padding=dw_kernel_size // 2, - groups=embed_dims) if with_glu_dw_conv else nn.Identity() # value - self.large_kernel_unit = InceptionGAU( + self.value = InceptionGAU( embed_dims, dw_kernel_size, with_channel_split=with_channel_split, 
with_dilation=with_dilation, @@ -563,29 +549,45 @@ def __init__(self, in_channels=embed_dims, out_channels=embed_dims, kernel_size=1) self.channel_split_group = sum(with_channel_split) assert embed_dims % self.channel_split_group == 0 + assert with_channel_shuffle == False # activation for gating and value self.act_value = custom_build_activation_layer(act_value_kernel) self.act_gate = custom_build_activation_layer(act_gate_kernel) + # decompose + self.decompose_position = decompose_position if decompose_method is not None else 'none' + assert decompose_method in [None, 'pool',] + assert decompose_position in ['before', 'between', 'between-shortcut', 'after',] + if decompose_method is not None: + self.sigma = ElementScale(embed_dims, 0., requires_grad=True) + else: + self.sigma = None + + def feat_decompose(self, x, shortcut=None): + x_d = shortcut if shortcut is not None else x + x_d = F.adaptive_avg_pool2d(x_d, output_size=1) # [B, C, 1, 1] + x = x + self.sigma(x - x_d) + return x + def forward(self, x): - shorcut = x.clone() + shortcut = x.clone() + + if self.decompose_position == 'before': + x = self.feat_decompose(x) x = self.proj_1(x) + if self.decompose_position == 'between': + x = self.feat_decompose(x) + if self.decompose_position == 'between-shortcut': + x = self.feat_decompose(x, shortcut=shortcut) x = self.act_value(x) + if self.decompose_position == 'after': + x = self.feat_decompose(x) - # value - v = self.large_kernel_unit(x) - v = self.act_value(v) - # gating - g = self.conv_g(x) - g = self.proj_g(g) - g = self.act_gate(g) - - x = g * v - if self.with_channel_shuffle: - x = channel_shuffle(x, groups=self.channel_split_group) + # gating * value + x = self.act_gate(self.gate(x)) * self.act_value(self.value(x)) x = self.proj_2(x) - x = x + shorcut + x = x + shortcut return x @@ -625,14 +627,14 @@ def __init__(self, attn_dw_kernel_size=5, attn_with_dilation=True, attn_with_pointwise=True, - attn_with_glu_dw_conv=False, attn_with_channel_shuffle=False, + attn_decompose_method=None, + attn_decompose_position='before', ffn_dwconv_kernel_size=3, - ffn_decompose_repeat=1, - ffn_decompose_method='mean', - ffn_decompose_reweight='fix', + ffn_decompose_method='after', ffn_decompose_init_value=1, ffn_decompose_act_cfg=None, + ffn_decompose_post_conv=False, init_cfg=None): super(VANBlock, self).__init__(init_cfg=init_cfg) self.out_channels = embed_dims @@ -648,7 +650,6 @@ def __init__(self, act_gate_kernel=attn_act_gate_cfg, with_dilation=attn_with_dilation, with_pointwise=attn_with_pointwise, - with_glu_dw_conv=attn_with_glu_dw_conv, with_channel_shuffle=attn_with_channel_shuffle, ) elif attention_types == "InceptionGAU": @@ -660,8 +661,9 @@ def __init__(self, with_channel_split=with_channel_split, with_dilation=attn_with_dilation, with_pointwise=attn_with_pointwise, - with_glu_dw_conv=attn_with_glu_dw_conv, with_channel_shuffle=attn_with_channel_shuffle, + decompose_method=attn_decompose_method, + decompose_position=attn_decompose_position, ) else: self.attn = VANAttention( @@ -691,11 +693,10 @@ def __init__(self, act_cfg=act_cfg, kernel_size=ffn_dwconv_kernel_size, ffn_drop=drop_rate, - decompose_repeat=ffn_decompose_repeat, decompose_method=ffn_decompose_method, - decompose_reweight=ffn_decompose_reweight, decompose_init_value=ffn_decompose_init_value, decompose_act_cfg=ffn_decompose_act_cfg, + decompose_post_conv=ffn_decompose_post_conv, ) else: self.mlp = ConvFFN( # vanilla FFN @@ -807,7 +808,7 @@ def forward(self, x): @BACKBONES.register_module() class LAN(BaseBackbone): """Linear 
Attention Network based on Visual Attention Network. - v08.11, IP53 + v08.17, IP53 Args: arch (str | dict): Visual Attention Network architecture. @@ -894,13 +895,13 @@ def __init__(self, attn_with_dilation=True, attn_with_pointwise=True, attn_with_channel_shuffle=False, - attn_with_glu_dw_conv=False, + attn_decompose_method=None, + attn_decompose_position='before', ffn_dwconv_kernel_size=3, - ffn_decompose_repeat=1, - ffn_decompose_method='mean', - ffn_decompose_reweight='fix', - ffn_decompose_init_value=1, + ffn_decompose_method='after', + ffn_decompose_init_value=0, ffn_decompose_act_cfg=None, + ffn_decompose_post_conv=False, block_cfgs=dict(), init_cfg=None): super(LAN, self).__init__(init_cfg=init_cfg) @@ -975,13 +976,13 @@ def __init__(self, attn_with_dilation=attn_with_dilation, attn_with_pointwise=attn_with_pointwise, attn_with_channel_shuffle=attn_with_channel_shuffle, - attn_with_glu_dw_conv=attn_with_glu_dw_conv, + attn_decompose_method=attn_decompose_method, + attn_decompose_position=attn_decompose_position, ffn_dwconv_kernel_size=ffn_dwconv_kernel_size, - ffn_decompose_repeat=ffn_decompose_repeat, ffn_decompose_method=ffn_decompose_method, - ffn_decompose_reweight=ffn_decompose_reweight, ffn_decompose_init_value=ffn_decompose_init_value, ffn_decompose_act_cfg=ffn_decompose_act_cfg, + ffn_decompose_post_conv=ffn_decompose_post_conv, **block_cfgs) for j in range(depth) ]) cur_block_idx += depth diff --git a/openmixup/models/heads/pmix_block.py b/openmixup/models/heads/pmix_block.py index f8b72416..0531f157 100644 --- a/openmixup/models/heads/pmix_block.py +++ b/openmixup/models/heads/pmix_block.py @@ -3,13 +3,11 @@ import torch import torch.nn as nn import torch.nn.functional as F -from mmcv.cnn import build_norm_layer, constant_init, kaiming_init, normal_init, \ - ConvModule, NonLocal2d +from mmcv.cnn import ConvModule, constant_init, kaiming_init, normal_init from mmcv.runner import BaseModule, force_fp32 from openmixup.utils import print_log from ..registry import HEADS -from ..necks import ConvNeck from .. import builder @@ -32,36 +30,15 @@ class PixelMixBlock(BaseModule): when the mode is `embedded_gaussian`. Default: True. unsampling_mode (str or list): Unsampling mode {'nearest', 'bilinear', etc}. Build a list for various upsampling mode. Default: 'nearest'. - pre_norm_cfg (dict): Config dict for a norm before q,k,v input of MixBlock. - e.g., pre_norm_cfg=dict(type='BN', requires_grad=True). - Default: None. - pre_conv_cfg (dict): Config dict for a before MixBlock convolution neck. - e.g., pre_conv_cfg=dict( - type="ConvNeck", in_channels=256, hid_channels=128, out_channels=256, - num_layers=2, kernel_size=3, with_bias=True, with_residual=True). - Default: None. - pre_attn_cfg (dict): Config dict for a before MixBlock self-attention block. - e.g., pre_attn_cfg=dict(in_channels=256, mode='gaussian'). - Default: None. - pre_neck_cfg (dict): Config dict for a Neck parallel to MixBlock, which converts - feature maps to flattened vectors for the pre_head (directly supervised by loss). - E.g., pre_neck_cfg=dict( - type='LinearNeck', in_channels=256, out_channels=128, with_avg_pool=True) - Default: None. - pre_head_cfg (dict): Config dict for a loss head parallel to MixBlock, e.g., infoNCE - or classification CE loss, which is used to train pre_conv and pre_attn. - Default: None. lam_concat (bool): Whether to concat lam as a channel in all input q, k, v. Default: False. 
            (lam_concat=False if lam_concat_v=True)
         lam_concat_v (bool): Whether to concat lam as a channel in v but not in q, k.
            Default: False. (lam_concat_v=False if lam_concat=True)
         lam_mul (bool or float): Whether to multiply lam in x_lam and multiply (1-lam) in x_lam_
-            to get pair-wise weight.
-            Default: False.
+            to get pair-wise weight. Default: False.
         lam_mul_k (float or list): Rescale lambda before multiplying to x, which is adjusted
            by k. Build a list for various adjusting k. Default: -1.
-        lam_residual (bool): Whether to use residual addition for lam_mult.
-            Default: False.
+        lam_residual (bool): Whether to use residual addition for lam_mult. Default: False.
         value_neck_cfg (dict): Config dict for a non-linear value embedding network.
            E.g., value_neck_cfg=dict(
                type="ConvNeck", in_channels=256, hid_channels=128, out_channels=1,
@@ -77,7 +54,6 @@ class PixelMixBlock(BaseModule):
         mask_loss_mode (str): Loss mode in {"none", "L2", "L1", "Variance", "L1+Variance",
            "L2+Variance", "Sparsity"} to calculate loss. Default: "none".
         mask_loss_margin (int): Margin loss for the grid mask patterns. Default: 0.
-        mask_mode (str): Which mode to normalize mixup masks to sum=1. Default: "none".
     """
 
     def __init__(self,
@@ -85,11 +61,6 @@
                  reduction=2,
                  use_scale=True,
                  unsampling_mode='bilinear',
-                 pre_norm_cfg=None,
-                 pre_conv_cfg=None,
-                 pre_attn_cfg=None,
-                 pre_neck_cfg=None,
-                 pre_head_cfg=None,
                  lam_concat=False,
                  lam_concat_v=False,
                  lam_mul=0.,
@@ -100,9 +71,8 @@
                  x_v_concat=False,
                  att_norm_cfg=None,
                  att_act_cfg=None,
-                 mask_loss_mode="none",
+                 mask_loss_mode="L1",
                  mask_loss_margin=0,
-                 mask_mode="none",
                  frozen=False,
                  init_cfg=None,
                  **kwargs):
@@ -117,28 +87,6 @@
         for m in self.unsampling_mode:
             assert m in ['nearest', 'bilinear', 'bicubic',]
 
-        # pre MixBlock or parallel to MixBlock
-        assert pre_norm_cfg is None or isinstance(pre_norm_cfg, dict)
-        assert pre_conv_cfg is None or isinstance(pre_conv_cfg, dict)
-        assert pre_attn_cfg is None or isinstance(pre_attn_cfg, dict)
-        assert pre_neck_cfg is None or isinstance(pre_neck_cfg, dict)
-        assert pre_head_cfg is None or isinstance(pre_head_cfg, dict)
-        self.pre_norm = pre_norm_cfg
-        self.pre_conv = pre_conv_cfg
-        self.pre_attn = pre_attn_cfg
-        self.pre_neck = pre_neck_cfg
-        self.pre_head = pre_head_cfg
-        if pre_norm_cfg is not None:
-            _, self.pre_norm = build_norm_layer(pre_norm_cfg, in_channels)
-        if pre_conv_cfg is not None:
-            self.pre_conv = ConvNeck(**pre_conv_cfg)
-        if pre_attn_cfg is not None:
-            self.pre_attn = NonLocal2d(**pre_attn_cfg)
-        if pre_neck_cfg is not None:
-            self.pre_neck = builder.build_neck(pre_neck_cfg)
-        if pre_head_cfg is not None:
-            self.pre_head = builder.build_head(pre_head_cfg)
-
         # mixblock args
         self.lam_concat = bool(lam_concat)
         self.lam_concat_v = bool(lam_concat_v)
@@ -153,15 +101,11 @@
         self.x_v_concat = bool(x_v_concat)
         self.mask_loss_mode = str(mask_loss_mode)
         self.mask_loss_margin = max(mask_loss_margin, 0.)
-        self.mask_mode = str(mask_mode)
         self.frozen = bool(frozen)
         assert 0 <= lam_mul and lam_mul <= 1
         for i in range(len(self.lam_mul_k)):
             self.lam_mul_k[i] = min(self.lam_mul_k[i], 10) if self.lam_mul_k[i] >= 0 else -1
-        assert mask_loss_mode in [
-            "none", "L2", "L1", "Variance", "L1+Variance", "L2+Variance", "Sparsity"]
-        assert mask_mode in [
-            "none", "none_v_", "sum", "softmax"]
+        assert mask_loss_mode in ["L1", "L1+Variance", "L2+Variance", "Sparsity"]
         if self.lam_concat or self.lam_concat_v:
             assert self.lam_concat != self.lam_concat_v, \
                 "lam_concat can be adopted on q,k,v or only on v"
@@ -187,8 +131,8 @@
         # MixBlock, conv value
         if value_neck_cfg is None:
             self.value = nn.Conv2d(
-                self.v_in_channels,
-                1,
+                in_channels=self.v_in_channels,
+                out_channels=1,
                 kernel_size=1,
                 stride=1)
         else:
@@ -202,7 +146,7 @@
             in_channels=self.qk_in_channels,
             out_channels=self.inter_channels,
             kernel_size=1, stride=1, padding=0,
-            groups=1, bias='auto',
+            groups=1, bias='auto',
             norm_cfg=att_norm_cfg,
             act_cfg=att_act_cfg,
         )
@@ -238,22 +182,6 @@ def init_weights(self, init_linear='normal', std=0.01, bias=0.):
 
     def _freeze(self):
         if self.frozen:
-            # before mixblock
-            if self.pre_norm is not None:
-                for param in self.pre_norm.parameters():
-                    param.requires_grad = False
-            if self.pre_conv is not None:
-                for param in self.pre_conv.parameters():
-                    param.requires_grad = False
-            if self.pre_attn is not None:
-                for param in self.pre_attn.parameters():
-                    param.requires_grad = False
-            if self.pre_neck is not None:
-                for param in self.pre_neck.parameters():
-                    param.requires_grad = False
-            if self.pre_head is not None:
-                for param in self.pre_head.parameters():
-                    param.requires_grad = False
             # mixblock
             for param in self.query.parameters():
                 param.requires_grad = False
@@ -287,7 +215,7 @@ def embedded_gaussian(self, q_x, k_x):
             raise ValueError("Precision overflow in MixBlock, try fp32 training.")
         if self.use_scale:
             # q_x.shape[-1] is `self.inter_channels`
-            pairwise_weight /= q_x.shape[-1]**0.5
+            pairwise_weight /= q_x.shape[-1] ** 0.5
         # force fp32 in exp
         pairwise_weight = pairwise_weight.type(torch.float32).softmax(dim=-1)
         return pairwise_weight
@@ -296,14 +224,12 @@ def rescale_lam_mult(self, lam, k=1):
         """ adjust lam against y=x in terms of k """
         assert k >= 0
         k += 1
-        if not isinstance(lam, float):
-            lam = float(lam)
+        lam = float(lam)
         return 1 / (k - 2/3) * (4/3 * math.pow(lam, 3) - 2 * lam**2 + k * lam)
 
     def forward(self, x, lam, index, scale_factor, debug=False, unsampling_override=None):
-        """ v08.23, add pre_conv and pre_attn
-        v01.07, add override upsampling
-
+        """
+
         Args:
             x (tensor): Input feature map [N, C, H, W].
             lam (float): Mixup ratio lambda.
             index (tensor): Random shuffle index in current mini-batch.
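Restated standalone, the lam rescaling used by `rescale_lam_mult` above (plain Python, no OpenMixup dependencies) makes it easy to check that the cubic keeps the endpoints of lam fixed for any k >= 0, so the residual rescaling cannot push lam outside [0, 1] at the boundaries:

```python
import math

def rescale_lam_mult(lam: float, k: float = 1) -> float:
    """Rescale lam against y = x; larger k keeps the curve closer to identity."""
    assert k >= 0
    k += 1
    return 1 / (k - 2 / 3) * (4 / 3 * math.pow(lam, 3) - 2 * lam ** 2 + k * lam)

# lam = 0 and lam = 1 are fixed points for any k >= 0:
# f(1) = (4/3 - 2 + (k+1)) / ((k+1) - 2/3) = 1, and f(0) = 0
for k in (0.0, 0.25, 1.0, 10.0):
    assert abs(rescale_lam_mult(0.0, k)) < 1e-9
    assert abs(rescale_lam_mult(1.0, k) - 1.0) < 1e-9
```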
@@ -317,13 +243,6 @@ def forward(self, x, lam, index, scale_factor, debug=False, unsampling_override=
             assert len(x) == 2  # only for SSL mixup
             x = torch.cat(x)
         n, _, h, w = x.size()
-        # pre-step 1: before mixblock, add pre conv and attn
-        if self.pre_attn is not None:
-            x = self.pre_attn(x)
-        if self.pre_conv is not None:
-            x = self.pre_conv([x])[0]
-        if self.pre_norm is not None:
-            x = self.pre_norm(x)
         if index is None:  # only for SSL mixup, [2N, C, H, W]
             n = n // 2
@@ -334,7 +253,7 @@
         x_lam_ = x[index, :]  # shuffle within a gpu
         results = dict(x_lam=x_lam, x_lam_=x_lam_)
 
-        # pre-step 2: lambda encoding
+        # pre-step 1: lambda encoding
         if self.lam_mul > 0:  # multiply lam to x_lam
             assert self.lam_concat == False
             # rescale lam
@@ -352,32 +271,26 @@
             x_lam_ = x_lam_ * (1 - lam_rescale)
         if self.lam_concat:  # concat lam as a new channel
             # assert self.lam_mul > 0 and self.x_qk_concat == False
-            lam_block = torch.zeros(n, 1, h, w).cuda().type_as(x_lam)
+            lam_block = torch.zeros(n, 1, h, w).to(x_lam)
             lam_block[:] = lam
             x_lam = torch.cat([x_lam, lam_block], dim=1)
             x_lam_ = torch.cat([x_lam_, 1-lam_block], dim=1)
 
         # **** step 1: compute 1x1 conv value, v: [N, HxW, 1] ****
-        v, v_ = x_lam, x_lam_
+        v_ = x_lam_
         if self.x_v_concat:
-            v = torch.cat([x_lam, x_lam_], dim=1)
-            v_ = v
+            v_ = torch.cat([x_lam, x_lam_], dim=1)
         if self.lam_concat_v:
-            lam_block = torch.zeros(n, 1, h, w).cuda().type_as(x_lam)
+            lam_block = torch.zeros(n, 1, h, w).to(x_lam)
             lam_block[:] = lam
-            v = torch.cat([x_lam, lam_block], dim=1)
             v_ = torch.cat([x_lam_, 1-lam_block], dim=1)
-        if self.mask_mode != "none":  # compute both v and v_
-            if self.value_neck_cfg is None:
-                v_ = self.value(v_).view(n, 1, -1)  # [N, 1, HxW]
-            else:
-                v_ = self.value([v_])[0].view(n, 1, -1)  # [N, 1, HxW]
-            v_ = v_.permute(0, 2, 1)  # v_ for 1-lam: [N, HxW, 1]
+        # compute v_
         if self.value_neck_cfg is None:
-            v = self.value(v).view(n, 1, -1)  # [N, 1, HxW]
+            v_ = self.value(v_).view(n, 1, -1)  # [N, 1, HxW]
         else:
-            v = self.value([v])[0].view(n, 1, -1)  # [N, 1, HxW]
-        v = v.permute(0, 2, 1)  # v for lam: [N, HxW, 1]
+            v_ = self.value([v_])[0].view(n, 1, -1)  # [N, 1, HxW]
+        v_ = v_.permute(0, 2, 1)  # v_ for 1-lam: [N, HxW, 1]
+
         # debug mode
         if debug:
             debug_plot = dict(value=v_.view(n, h, -1).clone().detach())
@@ -402,6 +315,7 @@
         if debug:
             debug_plot["pairwise_weight"] = pairwise_weight.clone().detach()
             results["debug_plot"] = debug_plot
+
         # choose upsampling mode
         if unsampling_override is not None:
             if isinstance(unsampling_override, str):
@@ -416,54 +330,23 @@
             up_mode = random.choices(self.unsampling_mode, k=1)[0]
 
         # **** step 4: generate mixup mask and upsampling ****
-        if self.mask_mode in ["none", "sum", "softmax"]:
-            # P^T x v_lam = mask_lam, force fp32 in matmul (causing NAN in fp16)
-            mask_lam = torch.matmul(
-                pairwise_weight.permute(0, 2, 1).type(torch.float32), v.type(torch.float32)
-            ).view(n, 1, h, w)  # mask for lam
-            if torch.any(torch.isnan(mask_lam)):
-                print_log("Warming mask_lam is nan, P: {}, v: {}, remove nan.".format(
-                    pairwise_weight, v), logger='root')
-                mask_lam = torch.matmul(  # P^T x v_lam = mask_lam
-                    pairwise_weight.permute(0, 2, 1).type(torch.float64), v.type(torch.float64)
-                ).view(n, 1, h, w)
-                mask_lam = torch.where(torch.isnan(mask_lam),
-                    torch.full_like(mask_lam, 1e-4), mask_lam)
-            mask_lam = F.interpolate(mask_lam, scale_factor=scale_factor, mode=up_mode)
-            # mask for lam in [0, 1], force fp32 in exp
-            mask_lam = torch.sigmoid(mask_lam.type(torch.float32))
-        if self.mask_mode in ["none_v_", "sum", "softmax"]:
-            # P x v_lam_ = mask_lam_, force fp32 in matmul (causing NAN in fp16)
+        # P x v_lam_ = mask_lam_, force fp32 in matmul (causing NAN in fp16)
+        mask_lam_ = torch.matmul(
+            pairwise_weight.type(torch.float32), v_.type(torch.float32)
+        ).view(n, 1, h, w)  # mask for 1-lam
+        if torch.any(torch.isnan(mask_lam_)):
+            print_log("Warning mask_lam_ is nan, P: {}, v: {}, remove nan.".format(
+                pairwise_weight, v_), logger='root')
             mask_lam_ = torch.matmul(
-                pairwise_weight.type(torch.float32), v_.type(torch.float32)
-            ).view(n, 1, h, w)  # mask for 1-lam
-            if torch.any(torch.isnan(mask_lam_)):
-                print_log("Warming mask_lam_ is nan, P: {}, v: {}, remove nan.".format(
-                    pairwise_weight, v_), logger='root')
-                mask_lam_ = torch.matmul(
-                    pairwise_weight.type(torch.float64), v_.type(torch.float64)
-                ).view(n, 1, h, w)
-                mask_lam_ = torch.where(torch.isnan(mask_lam_),
-                    torch.full_like(mask_lam_, 1e-4), mask_lam_)
-            mask_lam_ = F.interpolate(mask_lam_, scale_factor=scale_factor, mode=up_mode)
-            # mask for 1-lam in [0, 1], force fp32 in exp (causing NAN in fp16)
-            mask_lam_ = torch.sigmoid(mask_lam_.type(torch.float32))
+                pairwise_weight.type(torch.float64), v_.type(torch.float64)
+            ).view(n, 1, h, w)
+            mask_lam_ = torch.where(torch.isnan(mask_lam_),
+                torch.full_like(mask_lam_, 1e-4), mask_lam_)
+        mask_lam_ = F.interpolate(mask_lam_, scale_factor=scale_factor, mode=up_mode)
+        # mask for 1-lam in [0, 1], force fp32 in exp (causing NAN in fp16)
+        mask_lam_ = torch.sigmoid(mask_lam_.type(torch.float32))
 
-        if self.mask_mode == "none":
-            mask = torch.cat([mask_lam, 1 - mask_lam], dim=1)
-        elif self.mask_mode == "none_v_":
-            mask = torch.cat([1 - mask_lam_, mask_lam_], dim=1)
-        elif self.mask_mode == "sum":
-            # stop grad of one side [try]
-            mask = torch.cat([mask_lam.clone().detach(), mask_lam_], dim=1)
-            sum_masks = mask.sum(1, keepdim=True)  # sum to 1
-            mask /= sum_masks
-        elif self.mask_mode == "softmax":
-            # stop grad of one side [try]
-            mask = torch.cat([mask_lam.clone().detach(), mask_lam_], dim=1)
-            mask = mask.softmax(dim=1)  # sum to 1 by softmax
-        else:
-            raise NotImplementedError
+        mask = torch.cat([1 - mask_lam_, mask_lam_], dim=1)
 
         results["mask"] = mask
         return results
@@ -480,12 +363,6 @@ def mask_loss(self, mask, lam):
         if self.mask_loss_mode == "L1":  # [0, 1-m]
             losses['loss'] = torch.clamp(
                 torch.abs(1 - m_mean - lam) - self.mask_loss_margin, min=0.).mean()
-        elif self.mask_loss_mode == "L2":  # [0, 1-m^2]
-            losses['loss'] = torch.clamp(
-                (1 - m_mean - lam) ** 2 - self.mask_loss_margin ** 2, min=0.).mean()
-        elif self.mask_loss_mode == "Variance":  # [0, 0.5]
-            losses['loss'] = -torch.clamp(
-                (torch.sum((mask - m_mean)**2) / (n * h * w)), min=0.)
         elif self.mask_loss_mode == "Sparsity":  # [0, 0.25-m]
             losses['loss'] = torch.clamp(
                 torch.abs(mask * (mask - 1)).sum() / (n * h * w) - self.mask_loss_margin, min=0.)
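After this cleanup only the former `none_v_` path survives in `PixelMixBlock.forward`: one sigmoid mask is computed for the (1-lam) side and its complement serves the lam side, so the two mask channels sum to 1 by construction. A minimal self-contained sketch of that retained step with toy shapes (plain PyTorch; illustrative names, not the module itself):

```python
import torch
import torch.nn.functional as F

def make_mixup_mask(pairwise_weight, v_, h, w, scale_factor=4, up_mode='bilinear'):
    """P x v_ -> upsampled sigmoid mask for (1 - lam); channels sum to 1."""
    n = pairwise_weight.size(0)
    # force fp32 in the matmul (fp16 tends to overflow here, as the diff notes)
    mask_lam_ = torch.matmul(
        pairwise_weight.float(), v_.float()).view(n, 1, h, w)
    mask_lam_ = F.interpolate(mask_lam_, scale_factor=scale_factor, mode=up_mode)
    mask_lam_ = torch.sigmoid(mask_lam_)  # (1 - lam) side in [0, 1]
    return torch.cat([1 - mask_lam_, mask_lam_], dim=1)

# toy shapes: N=2, 4x4 feature map (16 tokens), upsampled 4x to 16x16 masks
P = torch.softmax(torch.randn(2, 16, 16), dim=-1)  # [N, HxW, HxW] attention
v_ = torch.randn(2, 16, 1)                         # [N, HxW, 1] value
mask = make_mixup_mask(P, v_, h=4, w=4)            # [N, 2, 16, 16]
assert torch.allclose(mask.sum(dim=1), torch.ones_like(mask[:, 0]))
```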