From 01fde43e51001defc67bc6cc4e4cc9ecb78c59d3 Mon Sep 17 00:00:00 2001
From: ayeffkay <chatton1@mail.ru>
Date: Tue, 13 Jul 2021 18:16:26 +0300
Subject: [PATCH 01/17] DistilBERT links and description added

---
 docs/features/models/bert.rst | 10 ++++++++++
 1 file changed, 10 insertions(+)

diff --git a/docs/features/models/bert.rst b/docs/features/models/bert.rst
index 564cd31d73..5404a4d433 100644
--- a/docs/features/models/bert.rst
+++ b/docs/features/models/bert.rst
@@ -29,6 +29,8 @@ We have trained BERT-base model for other languages and domains:
    `[deeppavlov_pytorch] <http://files.deeppavlov.ai/deeppavlov_data/bert/conversational_cased_L-12_H-768_A-12_pt.tar.gz>`__
 -  Conversational RuBERT, Russian, cased, 12-layer, 768-hidden, 12-heads, 180M parameters: `[deeppavlov] <http://files.deeppavlov.ai/deeppavlov_data/bert/ru_conversational_cased_L-12_H-768_A-12.tar.gz>`__,
    `[deeppavlov_pytorch] <http://files.deeppavlov.ai/deeppavlov_data/bert/ru_conversational_cased_L-12_H-768_A-12_pt.tar.gz>`__
+-  Conversational DistilRuBERT, Russian, cased, 6-layer, 768-hidden, 12-heads, 135.4M parameters: `[deeppavlov_pytorch] <http://files.deeppavlov.ai/deeppavlov_data/bert/distil_ru_conversational_cased_L-6_H-768_A-12_pt.tar.gz>`__
+-  Conversational DistilRuBERT-tiny, Russian, cased, 2-layer, 768-hidden, 12-heads, 107M parameters: `[deeppavlov_pytorch] <http://files.deeppavlov.ai/deeppavlov_data/bert/distil_ru_conversational_cased_L-2_H-768_A-12_pt.tar.gz>`__
 -  Sentence Multilingual BERT, 101 languages, cased, 12-layer, 768-hidden, 12-heads, 180M parameters: `[deeppavlov] <http://files.deeppavlov.ai/deeppavlov_data/bert/sentence_multi_cased_L-12_H-768_A-12.tar.gz>`__,
    `[deeppavlov_pytorch] <http://files.deeppavlov.ai/deeppavlov_data/bert/sentence_multi_cased_L-12_H-768_A-12_pt.tar.gz>`__
 -  Sentence RuBERT, Russian, cased, 12-layer, 768-hidden, 12-heads, 180M parameters: `[deeppavlov] <http://files.deeppavlov.ai/deeppavlov_data/bert/sentence_ru_cased_L-12_H-768_A-12.tar.gz>`__,
@@ -50,6 +52,13 @@ English cased version of BERT-base as initialization for English Conversational
 Conversational RuBERT was trained on OpenSubtitles [5]_, Dirty, Pikabu, and Social Media segment of Taiga corpus [8]_.
 We assembled new vocabulary for Conversational RuBERT model on this data and initialized model with RuBERT.
 
+Conversational DistilRuBERT (6 transformer layers) and DistilRuBERT-tiny (2 transformer layers) were trained on the same data as Conversational RuBERT and were heavily inspired by DistilBERT [13]_. Namely, the Distil* models (students) used the pretrained Conversational RuBERT as the teacher and were trained with a linear combination of the following losses:
+
+1. Masked language modeling loss (between student output logits for tokens and their true labels)
+2. Kullback-Leibler divergence (between student and teacher output logits)
+3. Cosine embedding loss (between averaged hidden states of the teacher and hidden states of the student)
+4. Mean squared error loss (between averaged attention maps of the teacher and attention maps of the student)
+
 Sentence Multilingual BERT is a representation-based sentence encoder for 101 languages of Multilingual BERT.
 It is initialized with Multilingual BERT and then fine-tuned on english MultiNLI [9]_ and on dev set of multilingual XNLI [10]_.
 Sentence representations are mean pooled token embeddings in the same manner as in Sentence-BERT [12]_.
@@ -196,3 +205,4 @@ the :doc:`config </intro/configuration>` file must be changed to match new BERT
 .. [10] Williams A., Bowman S. (2018) XNLI: Evaluating Cross-lingual Sentence Representations. arXiv preprint arXiv:1809.05053
 .. [11] S. R. Bowman, G. Angeli, C. Potts, and C. D. Manning. (2015) A large annotated corpus for learning natural language inference. arXiv preprint arXiv:1508.05326
 .. [12] N. Reimers, I. Gurevych (2019) Sentence-BERT: Sentence Embeddings using Siamese BERT-Networks. arXiv preprint arXiv:1908.10084
+.. [13] Sanh, V., Debut, L., Chaumond, J., & Wolf, T. (2019). DistilBERT, a distilled version of BERT: smaller, faster, cheaper and lighter. arXiv preprint arXiv:1910.01108.

From b003a53632a6e7b8ed6fc19f6ad13e11e4f320e2 Mon Sep 17 00:00:00 2001
From: ayeffkay <chatton1@mail.ru>
Date: Tue, 13 Jul 2021 18:23:12 +0300
Subject: [PATCH 02/17] Distil configs for paraphraser

---
 ...phraser_convers_distilrubert_2L_torch.json | 85 +++++++++++++++++++
 ...phraser_convers_distilrubert_6L_torch.json | 85 +++++++++++++++++++
 2 files changed, 170 insertions(+)
 create mode 100644 deeppavlov/configs/classifiers/paraphraser_convers_distilrubert_2L_torch.json
 create mode 100644 deeppavlov/configs/classifiers/paraphraser_convers_distilrubert_6L_torch.json

diff --git a/deeppavlov/configs/classifiers/paraphraser_convers_distilrubert_2L_torch.json b/deeppavlov/configs/classifiers/paraphraser_convers_distilrubert_2L_torch.json
new file mode 100644
index 0000000000..4e51808e89
--- /dev/null
+++ b/deeppavlov/configs/classifiers/paraphraser_convers_distilrubert_2L_torch.json
@@ -0,0 +1,85 @@
+{
+  "dataset_reader": {
+    "class_name": "paraphraser_reader",
+    "data_path": "{DOWNLOADS_PATH}/paraphraser_data",
+    "do_lower_case": false
+  },
+  "dataset_iterator": {
+    "class_name": "siamese_iterator",
+    "seed": 243,
+    "len_valid": 500
+  },
+  "chainer": {
+    "in": ["text_a", "text_b"],
+    "in_y": ["y"],
+    "pipe": [
+      {
+        "class_name": "torch_transformers_preprocessor",
+        "vocab_file": "{TRANSFORMER}",
+        "do_lower_case": false,
+        "max_seq_length": 64,
+        "in": ["text_a", "text_b"],
+        "out": ["bert_features"]
+      },
+      {
+        "class_name": "torch_transformers_classifier",
+        "n_classes": 2,
+        "return_probas": false,
+        "pretrained_bert": "{TRANSFORMER}",
+        "save_path": "{MODEL_PATH}/model",
+        "load_path": "{MODEL_PATH}/model",
+        "attention_probs_keep_prob": 0.11,
+        "hidden_keep_prob": 1.0,
+        "optimizer": "AdamW",
+        "optimizer_parameters": {
+          "lr": 1.89e-5
+        },
+        "learning_rate_drop_patience": 3,
+        "learning_rate_drop_div": 1.5,
+        "in": [
+          "bert_features"
+        ],
+        "in_y": [
+          "y"
+        ],
+        "out": [
+          "predictions"
+        ]
+      }
+    ],
+    "out": ["predictions"]
+  },
+  "train": {
+    "epochs": 100,
+    "batch_size": 64,
+    "metrics": [
+        "f1",
+        "accuracy"
+    ],
+    "validation_patience": 7,
+    "val_every_n_batches": 50,
+    "log_every_n_batches": 50,
+    "evaluation_targets": [
+      "train",
+      "valid",
+      "test"
+    ],
+    "tensorboard_log_dir": "{MODEL_PATH}/",
+    "class_name": "torch_trainer"
+  },
+  "metadata": {
+    "variables": {
+      "ROOT_PATH": "~/.deeppavlov",
+      "DOWNLOADS_PATH": "{ROOT_PATH}/downloads",
+      "TRANSFORMER": "DeepPavlov/distilrubert-tiny-cased-conversational",
+      "MODELS_PATH": "{ROOT_PATH}/models",
+      "MODEL_PATH": "{MODELS_PATH}/paraphraser_convers_distilrubert_2L_torch"
+    },
+    "download": [
+        {
+        "url": "http://files.deeppavlov.ai/deeppavlov_data/classifiers/paraphraser_convers_distilrubert_2L_torch.tar.gz",
+        "subdir": "{MODELS_PATH}"
+        }
+    ]
+  }
+}
diff --git a/deeppavlov/configs/classifiers/paraphraser_convers_distilrubert_6L_torch.json b/deeppavlov/configs/classifiers/paraphraser_convers_distilrubert_6L_torch.json
new file mode 100644
index 0000000000..ee21189915
--- /dev/null
+++ b/deeppavlov/configs/classifiers/paraphraser_convers_distilrubert_6L_torch.json
@@ -0,0 +1,85 @@
+{
+  "dataset_reader": {
+    "class_name": "paraphraser_reader",
+    "data_path": "{DOWNLOADS_PATH}/paraphraser_data",
+    "do_lower_case": false
+  },
+  "dataset_iterator": {
+    "class_name": "siamese_iterator",
+    "seed": 243,
+    "len_valid": 500
+  },
+  "chainer": {
+    "in": ["text_a", "text_b"],
+    "in_y": ["y"],
+    "pipe": [
+      {
+        "class_name": "torch_transformers_preprocessor",
+        "vocab_file": "{TRANSFORMER}",
+        "do_lower_case": false,
+        "max_seq_length": 64,
+        "in": ["text_a", "text_b"],
+        "out": ["bert_features"]
+      },
+      {
+        "class_name": "torch_transformers_classifier",
+        "n_classes": 2,
+        "return_probas": false,
+        "pretrained_bert": "{TRANSFORMER}",
+        "save_path": "{MODEL_PATH}/model",
+        "load_path": "{MODEL_PATH}/model",
+        "attention_probs_keep_prob": 0.0,
+        "hidden_keep_prob": 0.67,
+        "optimizer": "AdamW",
+        "optimizer_parameters": {
+          "lr": 7.22e-5
+        },
+        "learning_rate_drop_patience": 3,
+        "learning_rate_drop_div": 1.5,
+        "in": [
+          "bert_features"
+        ],
+        "in_y": [
+          "y"
+        ],
+        "out": [
+          "predictions"
+        ]
+      }
+    ],
+    "out": ["predictions"]
+  },
+  "train": {
+    "epochs": 100,
+    "batch_size": 64,
+    "metrics": [
+        "f1",
+        "accuracy"
+    ],
+    "validation_patience": 7,
+    "val_every_n_batches": 50,
+    "log_every_n_batches": 50,
+    "evaluation_targets": [
+      "train",
+      "valid",
+      "test"
+    ],
+    "tensorboard_log_dir": "{MODEL_PATH}/",
+    "class_name": "torch_trainer"
+  },
+  "metadata": {
+    "variables": {
+      "ROOT_PATH": "~/.deeppavlov",
+      "DOWNLOADS_PATH": "{ROOT_PATH}/downloads",
+      "TRANSFORMER": "DeepPavlov/distilrubert-base-cased-conversational",
+      "MODELS_PATH": "{ROOT_PATH}/models",
+      "MODEL_PATH": "{MODELS_PATH}/paraphraser_convers_distilrubert_6L_torch"
+    },
+    "download": [
+        {
+        "url": "http://files.deeppavlov.ai/deeppavlov_data/classifiers/paraphraser_convers_distilrubert_6L_torch.tar.gz",
+        "subdir": "{MODELS_PATH}"
+        }
+    ]
+  }
+}

From e840a0b2fa75f62e1633e08f27f5fee083c0d6f9 Mon Sep 17 00:00:00 2001
From: ayeffkay <chatton1@mail.ru>
Date: Tue, 13 Jul 2021 18:35:54 +0300
Subject: [PATCH 03/17] Distil scores and configs added

---
 docs/features/overview.rst | 264 ++++++++++++++++++++-----------------
 1 file changed, 140 insertions(+), 124 deletions(-)

diff --git a/docs/features/overview.rst b/docs/features/overview.rst
index 31e822ff89..72f58015c3 100644
--- a/docs/features/overview.rst
+++ b/docs/features/overview.rst
@@ -20,27 +20,31 @@ The second model reproduces architecture from the paper `Application
 of a Hybrid Bi-LSTM-CRF model to the task of Russian Named Entity Recognition <https://arxiv.org/pdf/1709.09686.pdf>`__
 which is inspired by Bi-LSTM+CRF architecture from https://arxiv.org/pdf/1603.01360.pdf.
 
-+---------------------------------------------------------+-------+-----------------------------------------------------------------------------+-------------+
-| Dataset                                                 | Lang  | Model                                                                       |   Test F1   |
-+=========================================================+=======+=============================================================================+=============+
-| Persons-1000 dataset with additional LOC and ORG markup | Ru    | :config:`ner_rus_bert.json <ner/ner_rus_bert.json>`                         |    98.1     |
-+                                                         +       +-----------------------------------------------------------------------------+-------------+
-| (Collection 3)                                          |       | :config:`ner_rus.json <ner/ner_rus.json>`                                   |    95.1     |
-+---------------------------------------------------------+-------+-----------------------------------------------------------------------------+-------------+
-| Ontonotes                                               | Multi | :config:`ner_ontonotes_bert_mult.json <ner/ner_ontonotes_bert_mult.json>`   |    88.8     |
-+                                                         +-------+-----------------------------------------------------------------------------+-------------+
-|                                                         | En    | :config:`ner_ontonotes_bert.json <ner/ner_ontonotes_bert.json>`             |    88.6     |
-+                                                         +       +-----------------------------------------------------------------------------+-------------+
-|                                                         |       | :config:`ner_ontonotes.json <ner/ner_ontonotes.json>`                       |    87.1     |
-+---------------------------------------------------------+       +-----------------------------------------------------------------------------+-------------+
-| ConLL-2003                                              |       | :config:`ner_conll2003_bert.json <ner/ner_conll2003_bert.json>`             |    91.7     |
-+                                                         +       +-----------------------------------------------------------------------------+-------------+
-|                                                         |       | :config:`ner_conll2003_torch_bert.json <ner/ner_conll2003_torch_bert.json>` |    88.6     |
-+                                                         +       +-----------------------------------------------------------------------------+-------------+
-|                                                         |       | :config:`ner_conll2003.json <ner/ner_conll2003.json>`                       |    89.9     |
-+---------------------------------------------------------+       +-----------------------------------------------------------------------------+-------------+
-| DSTC2                                                   |       | :config:`ner_dstc2.json <ner/ner_dstc2.json>`                               |    97.1     |
-+---------------------------------------------------------+-------+-----------------------------------------------------------------------------+-------------+
++---------------------------------------------------------+-------+--------------------------------------------------------------------------------------------------------+-------------+
+| Dataset                                                 | Lang  | Model                                                                                                  |   Test F1   |
++=========================================================+=======+========================================================================================================+=============+
+| Persons-1000 dataset with additional LOC and ORG markup | Ru    | :config:`ner_rus_bert.json <ner/ner_rus_bert.json>`                                                    |    98.1     |
++                                                         +       +--------------------------------------------------------------------------------------------------------+-------------+
+| (Collection 3)                                          |       | :config:`ner_rus.json <ner/ner_rus.json>`                                                              |    95.1     |
++                                                         +       +--------------------------------------------------------------------------------------------------------+-------------+
+|                                                         |       | :config:`ner_rus_convers_distilrubert_2L_torch.json <ner/ner_rus_convers_distilrubert_2L_torch.json>`  |  88.4 ± 0.5 |
++                                                         +       +--------------------------------------------------------------------------------------------------------+-------------+
+|                                                         |       | :config:`ner_rus_convers_distilrubert_6L_torch.json <ner/ner_rus_convers_distilrubert_6L_torch.json>`  |  93.3 ± 0.3 |
++---------------------------------------------------------+-------+--------------------------------------------------------------------------------------------------------+-------------+
+| Ontonotes                                               | Multi | :config:`ner_ontonotes_bert_mult.json <ner/ner_ontonotes_bert_mult.json>`                              |    88.8     |
++                                                         +-------+--------------------------------------------------------------------------------------------------------+-------------+
+|                                                         | En    | :config:`ner_ontonotes_bert.json <ner/ner_ontonotes_bert.json>`                                        |    88.6     |
++                                                         +       +--------------------------------------------------------------------------------------------------------+-------------+
+|                                                         |       | :config:`ner_ontonotes.json <ner/ner_ontonotes.json>`                                                  |    87.1     |
++---------------------------------------------------------+       +--------------------------------------------------------------------------------------------------------+-------------+
+| CoNLL-2003                                              |       | :config:`ner_conll2003_bert.json <ner/ner_conll2003_bert.json>`                                        |    91.7     |
++                                                         +       +--------------------------------------------------------------------------------------------------------+-------------+
+|                                                         |       | :config:`ner_conll2003_torch_bert.json <ner/ner_conll2003_torch_bert.json>`                            |    88.6     |
++                                                         +       +--------------------------------------------------------------------------------------------------------+-------------+
+|                                                         |       | :config:`ner_conll2003.json <ner/ner_conll2003.json>`                                                  |    89.9     |
++---------------------------------------------------------+       +--------------------------------------------------------------------------------------------------------+-------------+
+| DSTC2                                                   |       | :config:`ner_dstc2.json <ner/ner_dstc2.json>`                                                          |    97.1     |
++---------------------------------------------------------+-------+--------------------------------------------------------------------------------------------------------+-------------+
 
 Slot filling models :doc:`[docs] </features/models/slot_filling>`
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
@@ -63,61 +67,65 @@ BiLSTM with self-attention and other models are presented. The model also allows
 Several pre-trained models are available and presented in Table below.
 
 
-+------------------+--------------------+------+-------------------------------------------------------------------------------------------------+-------------+--------+--------+-----------+
-| Task             | Dataset            | Lang | Model                                                                                           | Metric      | Valid  | Test   | Downloads |
-+==================+====================+======+=================================================================================================+=============+========+========+===========+
-| 28 intents       | `DSTC 2`_          | En   | :config:`DSTC 2 emb <classifiers/intents_dstc2.json>`                                           | Accuracy    | 0.7613 | 0.7733 |  800 Mb   |
-+                  +                    +      +-------------------------------------------------------------------------------------------------+             +--------+--------+-----------+
-|                  |                    |      | :config:`Wiki emb <classifiers/intents_dstc2_big.json>`                                         |             | 0.9629 | 0.9617 |  8.5 Gb   |
-+                  +                    +      +-------------------------------------------------------------------------------------------------+             +--------+--------+-----------+
-|                  |                    |      | :config:`BERT <classifiers/intents_dstc2_bert.json>`                                            |             | 0.9673 | 0.9636 |  800 Mb   |
-+------------------+--------------------+      +-------------------------------------------------------------------------------------------------+-------------+--------+--------+-----------+
-| 7 intents        | `SNIPS-2017`_ [1]_ |      | :config:`DSTC 2 emb <classifiers/intents_snips.json>`                                           | F1-macro    | 0.8591 |    --  |  800 Mb   |
-+                  +                    +      +-------------------------------------------------------------------------------------------------+             +--------+--------+-----------+
-|                  |                    |      | :config:`Wiki emb <classifiers/intents_snips_big.json>`                                         |             | 0.9820 |    --  |  8.5 Gb   |
-+                  +                    +      +-------------------------------------------------------------------------------------------------+             +--------+--------+-----------+
-|                  |                    |      | :config:`Tfidf + SelectKBest + PCA + Wiki emb <classifiers/intents_snips_sklearn.json>`         |             | 0.9673 |    --  |  8.6 Gb   |
-+                  +                    +      +-------------------------------------------------------------------------------------------------+             +--------+--------+-----------+
-|                  |                    |      | :config:`Wiki emb weighted by Tfidf <classifiers/intents_snips_tfidf_weighted.json>`            |             | 0.9786 |    --  |  8.5 Gb   |
-+------------------+--------------------+      +-------------------------------------------------------------------------------------------------+-------------+--------+--------+-----------+
-| Insult detection | `Insults`_         |      | :config:`Reddit emb <classifiers/insults_kaggle.json>`                                          | ROC-AUC     | 0.9263 | 0.8556 |  6.2 Gb   |
-+                  +                    +      +-------------------------------------------------------------------------------------------------+             +--------+--------+-----------+
-|                  |                    |      | :config:`English BERT <classifiers/insults_kaggle_bert.json>`                                   |             | 0.9255 | 0.8612 |  1200 Mb  |
-+                  +                    +      +-------------------------------------------------------------------------------------------------+             +--------+--------+-----------+
-|                  |                    |      | :config:`English Conversational BERT <classifiers/insults_kaggle_conv_bert.json>`               |             | 0.9389 | 0.8941 |  1200 Mb  |
-+                  +                    +      +-------------------------------------------------------------------------------------------------+             +--------+--------+-----------+
-|                  |                    |      | :config:`English BERT on PyTorch <classifiers/insults_kaggle_bert_torch.json>`                  |             | 0.9329 | 0.877  |  1.1 Gb   |
-+------------------+--------------------+      +-------------------------------------------------------------------------------------------------+-------------+--------+--------+-----------+
-| 5 topics         | `AG News`_         |      | :config:`Wiki emb <classifiers/topic_ag_news.json>`                                             | Accuracy    | 0.8922 | 0.9059 |  8.5 Gb   |
-+------------------+--------------------+      +-------------------------------------------------------------------------------------------------+-------------+--------+--------+-----------+
-| Intent           |`Yahoo-L31`_        |      | :config:`Yahoo-L31 on conversational BERT <classifiers/yahoo_convers_vs_info_bert.json>`        | ROC-AUC     | 0.9436 |   --   |  1200 Mb  |
-+------------------+--------------------+      +-------------------------------------------------------------------------------------------------+-------------+--------+--------+-----------+
-| Sentiment        |`SST`_              |      | :config:`5-classes SST on conversational BERT <classifiers/sentiment_sst_conv_bert.json>`       | Accuracy    | 0.6456 | 0.6715 |  400 Mb   |
-+                  +                    +      +-------------------------------------------------------------------------------------------------+             +--------+--------+-----------+
-|                  |                    |      | :config:`5-classes SST on multilingual BERT <classifiers/sentiment_sst_multi_bert.json>`        |             | 0.5738 | 0.6024 |  660 Mb   |
-+                  +                    +      +-------------------------------------------------------------------------------------------------+             +--------+--------+-----------+
-|                  |                    |      | :config:`3-classes SST SWCNN on PyTorch <classifiers/sst_torch_swcnn.json>`                     |             | 0.7379 | 0.6312 |  4.3 Mb   |
-+                  +--------------------+      +-------------------------------------------------------------------------------------------------+             +--------+--------+-----------+
-|                  |`Yelp`_             |      | :config:`5-classes Yelp on conversational BERT <classifiers/sentiment_yelp_conv_bert.json>`     |             | 0.6925 | 0.6842 |  400 Mb   |
-+                  +                    +      +-------------------------------------------------------------------------------------------------+             +--------+--------+-----------+
-|                  |                    |      | :config:`5-classes Yelp on multilingual BERT <classifiers/sentiment_yelp_multi_bert.json>`      |             | 0.5896 | 0.5874 |  660 Mb   |
-+------------------+--------------------+------+-------------------------------------------------------------------------------------------------+-------------+--------+--------+-----------+
-| Sentiment        |`Twitter mokoron`_  | Ru   | :config:`RuWiki+Lenta emb w/o preprocessing <classifiers/sentiment_twitter.json>`               |             | 0.9965 | 0.9961 |  6.2 Gb   |
-+                  +                    +      +-------------------------------------------------------------------------------------------------+             +--------+--------+-----------+
-|                  |                    |      | :config:`RuWiki+Lenta emb with preprocessing <classifiers/sentiment_twitter_preproc.json>`      |             | 0.7823 | 0.7759 |  6.2 Gb   |
-+                  +--------------------+      +-------------------------------------------------------------------------------------------------+-------------+--------+--------+-----------+
-|                  |`RuSentiment`_      |      | :config:`RuWiki+Lenta emb <classifiers/rusentiment_cnn.json>`                                   | F1-weighted | 0.6541 | 0.7016 |  6.2 Gb   |
-+                  +                    +      +-------------------------------------------------------------------------------------------------+             +--------+--------+-----------+
-|                  |                    |      | :config:`Twitter emb super-convergence <classifiers/rusentiment_bigru_superconv.json>` [2]_     |             | 0.7301 | 0.7576 |  3.4 Gb   |
-+                  +                    +      +-------------------------------------------------------------------------------------------------+             +--------+--------+-----------+
-|                  |                    |      | :config:`ELMo <classifiers/rusentiment_elmo_twitter_cnn.json>`                                  |             | 0.7519 | 0.7875 |  700 Mb   |
-+                  +                    +      +-------------------------------------------------------------------------------------------------+             +--------+--------+-----------+
-|                  |                    |      | :config:`Multi-language BERT <classifiers/rusentiment_bert.json>`                               |             | 0.6809 | 0.7193 |  1900 Mb  |
-+                  +                    +      +-------------------------------------------------------------------------------------------------+             +--------+--------+-----------+
-|                  |                    |      | :config:`Conversational RuBERT <classifiers/rusentiment_convers_bert.json>`                     |             | 0.7548 | 0.7742 |  657 Mb   |
-+------------------+--------------------+      +-------------------------------------------------------------------------------------------------+-------------+--------+--------+-----------+
-| Intent           |Ru like`Yahoo-L31`_ |      | :config:`Conversational vs Informational on ELMo <classifiers/yahoo_convers_vs_info.json>`      | ROC-AUC     | 0.9412 |   --   |  700 Mb   |
-+------------------+--------------------+------+-------------------------------------------------------------------------------------------------+-------------+--------+--------+-----------+
++------------------+---------------------+------+----------------------------------------------------------------------------------------------------------+-------------+------------------+-----------------+-----------+
+| Task             | Dataset             | Lang | Model                                                                                                    | Metric      | Valid            | Test            | Downloads |
++==================+=====================+======+==========================================================================================================+=============+==================+=================+===========+
+| 28 intents       | `DSTC 2`_           | En   | :config:`DSTC 2 emb <classifiers/intents_dstc2.json>`                                                    | Accuracy    | 0.7613           | 0.7733          |  800 Mb   |
++                  +                     +      +----------------------------------------------------------------------------------------------------------+             +------------------+-----------------+-----------+
+|                  |                     |      | :config:`Wiki emb <classifiers/intents_dstc2_big.json>`                                                  |             | 0.9629           | 0.9617          |  8.5 Gb   |
++                  +                     +      +----------------------------------------------------------------------------------------------------------+             +------------------+-----------------+-----------+
+|                  |                     |      | :config:`BERT <classifiers/intents_dstc2_bert.json>`                                                     |             | 0.9673           | 0.9636          |  800 Mb   |
++------------------+---------------------+      +----------------------------------------------------------------------------------------------------------+-------------+------------------+-----------------+-----------+
+| 7 intents        | `SNIPS-2017`_ [1]_  |      | :config:`DSTC 2 emb <classifiers/intents_snips.json>`                                                    | F1-macro    | 0.8591           |    --           |  800 Mb   |
++                  +                     +      +----------------------------------------------------------------------------------------------------------+             +------------------+-----------------+-----------+
+|                  |                     |      | :config:`Wiki emb <classifiers/intents_snips_big.json>`                                                  |             | 0.9820           |    --           |  8.5 Gb   |
++                  +                     +      +----------------------------------------------------------------------------------------------------------+             +------------------+-----------------+-----------+
+|                  |                     |      | :config:`Tfidf + SelectKBest + PCA + Wiki emb <classifiers/intents_snips_sklearn.json>`                  |             | 0.9673           |    --           |  8.6 Gb   |
++                  +                     +      +----------------------------------------------------------------------------------------------------------+             +------------------+-----------------+-----------+
+|                  |                     |      | :config:`Wiki emb weighted by Tfidf <classifiers/intents_snips_tfidf_weighted.json>`                     |             | 0.9786           |    --           |  8.5 Gb   |
++------------------+---------------------+      +----------------------------------------------------------------------------------------------------------+-------------+------------------+-----------------+-----------+
+| Insult detection | `Insults`_          |      | :config:`Reddit emb <classifiers/insults_kaggle.json>`                                                   | ROC-AUC     | 0.9263           | 0.8556          |  6.2 Gb   |
++                  +                     +      +----------------------------------------------------------------------------------------------------------+             +------------------+-----------------+-----------+
+|                  |                     |      | :config:`English BERT <classifiers/insults_kaggle_bert.json>`                                            |             | 0.9255           | 0.8612          |  1200 Mb  |
++                  +                     +      +----------------------------------------------------------------------------------------------------------+             +------------------+-----------------+-----------+
+|                  |                     |      | :config:`English Conversational BERT <classifiers/insults_kaggle_conv_bert.json>`                        |             | 0.9389           | 0.8941          |  1200 Mb  |
++                  +                     +      +----------------------------------------------------------------------------------------------------------+             +------------------+-----------------+-----------+
+|                  |                     |      | :config:`English BERT on PyTorch <classifiers/insults_kaggle_bert_torch.json>`                           |             | 0.9329           | 0.877           |  1.1 Gb   |
++------------------+---------------------+      +----------------------------------------------------------------------------------------------------------+-------------+------------------+-----------------+-----------+
+| 5 topics         | `AG News`_          |      | :config:`Wiki emb <classifiers/topic_ag_news.json>`                                                      | Accuracy    | 0.8922           | 0.9059          |  8.5 Gb   |
++------------------+---------------------+      +----------------------------------------------------------------------------------------------------------+-------------+------------------+-----------------+-----------+
+| Intent           | `Yahoo-L31`_        |      | :config:`Yahoo-L31 on conversational BERT <classifiers/yahoo_convers_vs_info_bert.json>`                 | ROC-AUC     | 0.9436           |   --            |  1200 Mb  |
++------------------+---------------------+      +----------------------------------------------------------------------------------------------------------+-------------+------------------+-----------------+-----------+
+| Sentiment        | `SST`_              |      | :config:`5-classes SST on conversational BERT <classifiers/sentiment_sst_conv_bert.json>`                | Accuracy    | 0.6456           | 0.6715          |  400 Mb   |
++                  +                     +      +----------------------------------------------------------------------------------------------------------+             +------------------+-----------------+-----------+
+|                  |                     |      | :config:`5-classes SST on multilingual BERT <classifiers/sentiment_sst_multi_bert.json>`                 |             | 0.5738           | 0.6024          |  660 Mb   |
++                  +                     +      +----------------------------------------------------------------------------------------------------------+             +------------------+-----------------+-----------+
+|                  |                     |      | :config:`3-classes SST SWCNN on PyTorch <classifiers/sst_torch_swcnn.json>`                              |             | 0.7379           | 0.6312          |  4.3 Mb   |
++                  +---------------------+      +----------------------------------------------------------------------------------------------------------+             +------------------+-----------------+-----------+
+|                  | `Yelp`_             |      | :config:`5-classes Yelp on conversational BERT <classifiers/sentiment_yelp_conv_bert.json>`              |             | 0.6925           | 0.6842          |  400 Mb   |
++                  +                     +      +----------------------------------------------------------------------------------------------------------+             +------------------+-----------------+-----------+
+|                  |                     |      | :config:`5-classes Yelp on multilingual BERT <classifiers/sentiment_yelp_multi_bert.json>`               |             | 0.5896           | 0.5874          |  660 Mb   |
++------------------+---------------------+------+----------------------------------------------------------------------------------------------------------+-------------+------------------+-----------------+-----------+
+| Sentiment        | `Twitter mokoron`_  | Ru   | :config:`RuWiki+Lenta emb w/o preprocessing <classifiers/sentiment_twitter.json>`                        |             | 0.9965           | 0.9961          |  6.2 Gb   |
++                  +                     +      +----------------------------------------------------------------------------------------------------------+             +------------------+-----------------+-----------+
+|                  |                     |      | :config:`RuWiki+Lenta emb with preprocessing <classifiers/sentiment_twitter_preproc.json>`               |             | 0.7823           | 0.7759          |  6.2 Gb   |
++                  +---------------------+      +----------------------------------------------------------------------------------------------------------+-------------+------------------+-----------------+-----------+
+|                  | `RuSentiment`_      |      | :config:`RuWiki+Lenta emb <classifiers/rusentiment_cnn.json>`                                            | F1-weighted | 0.6541           | 0.7016          |  6.2 Gb   |
++                  +                     +      +----------------------------------------------------------------------------------------------------------+             +------------------+-----------------+-----------+
+|                  |                     |      | :config:`Twitter emb super-convergence <classifiers/rusentiment_bigru_superconv.json>` [2]_              |             | 0.7301           | 0.7576          |  3.4 Gb   |
++                  +                     +      +----------------------------------------------------------------------------------------------------------+             +------------------+-----------------+-----------+
+|                  |                     |      | :config:`ELMo <classifiers/rusentiment_elmo_twitter_cnn.json>`                                           |             | 0.7519           | 0.7875          |  700 Mb   |
++                  +                     +      +----------------------------------------------------------------------------------------------------------+             +------------------+-----------------+-----------+
+|                  |                     |      | :config:`Multi-language BERT <classifiers/rusentiment_bert.json>`                                        |             | 0.6809           | 0.7193          |  1900 Mb  |
++                  +                     +      +----------------------------------------------------------------------------------------------------------+             +------------------+-----------------+-----------+
+|                  |                     |      | :config:`Conversational RuBERT <classifiers/rusentiment_convers_bert.json>`                              |             | 0.7548           | 0.7742          |  657 Mb   |
++                  +                     +      +----------------------------------------------------------------------------------------------------------+             +------------------+-----------------+-----------+
+|                  |                     |      | :config:`Conversational DistilRuBERT-tiny <classifiers/rusentiment_convers_distilrubert_2L_torch.json>`  |             |  0.703 ± 0.0031  | 0.7348 ± 0.0028 |  690 Mb   |
++                  +                     +      +----------------------------------------------------------------------------------------------------------+             +------------------+-----------------+-----------+
+|                  |                     |      | :config:`Conversational DistilRuBERT-base <classifiers/rusentiment_convers_distilrubert_6L_torch.json>`  |             |  0.7376 ± 0.0045 | 0.7645 ± 0.035  |  1.0 Gb   |
++------------------+---------------------+      +----------------------------------------------------------------------------------------------------------+-------------+------------------+-----------------+-----------+
+| Intent           |Ru like `Yahoo-L31`_ |      | :config:`Conversational vs Informational on ELMo <classifiers/yahoo_convers_vs_info.json>`               | ROC-AUC     | 0.9412           |   --            |  700 Mb   |
++------------------+---------------------+------+----------------------------------------------------------------------------------------------------------+-------------+------------------+-----------------+-----------+
 
 .. [1] Coucke A. et al. Snips voice platform: an embedded spoken language understanding system for private-by-design voice interfaces //arXiv preprint arXiv:1805.10190. – 2018.
 .. [2] Smith L. N., Topin N. Super-convergence: Very fast training of residual networks using large learning rates. – 2018.
@@ -231,11 +239,11 @@ Available pre-trained models for ranking:
    +-------------------+----------------------------------------------------------------------------------------------------------------------+-----------+-------+-------+-------+-----------+
    | `Ubuntu V2`_      | :config:`ranking_ubuntu_v2_mt_word2vec_smn <ranking/ranking_ubuntu_v2_mt_word2vec_smn.json>`                         |   68.56   | 67.91 | 81.49 | 95.63 |  1609 MB  |
    +-------------------+----------------------------------------------------------------------------------------------------------------------+-----------+-------+-------+-------+-----------+
-   | `Ubuntu V2`_      |:config:`ranking_ubuntu_v2_bert_uncased <ranking/ranking_ubuntu_v2_bert_uncased.json>`                                |   66.5    | 66.6  | --    | --    |  396 MB   |
+   | `Ubuntu V2`_      | :config:`ranking_ubuntu_v2_bert_uncased <ranking/ranking_ubuntu_v2_bert_uncased.json>`                               |   66.5    | 66.6  | --    | --    |  396 MB   |
    +-------------------+----------------------------------------------------------------------------------------------------------------------+-----------+-------+-------+-------+-----------+
-   | `Ubuntu V2`_      |:config:`ranking_ubuntu_v2_bert_uncased on PyTorch <ranking/ranking_ubuntu_v2_torch_bert_uncased.json>`               |   65.73   | 65.74 | --    | --    |  1.1 Gb   |
+   | `Ubuntu V2`_      | :config:`ranking_ubuntu_v2_bert_uncased on PyTorch <ranking/ranking_ubuntu_v2_torch_bert_uncased.json>`              |   65.73   | 65.74 | --    | --    |  1.1 Gb   |
    +-------------------+----------------------------------------------------------------------------------------------------------------------+-----------+-------+-------+-------+-----------+
-   | `Ubuntu V2`_      |:config:`ranking_ubuntu_v2_bert_sep <ranking/ranking_ubuntu_v2_bert_sep.json>`                                        |   66.5    | 66.5  | --    | --    |  396 MB   |
+   | `Ubuntu V2`_      | :config:`ranking_ubuntu_v2_bert_sep <ranking/ranking_ubuntu_v2_bert_sep.json>`                                       |   66.5    | 66.5  | --    | --    |  396 MB   |
    +-------------------+----------------------------------------------------------------------------------------------------------------------+-----------+-------+-------+-------+-----------+
    | `Ubuntu V2`_      | :config:`ranking_ubuntu_v2_mt_interact <ranking/ranking_ubuntu_v2_mt_interact.json>`                                 |   59.2    | 58.7  | --    | --    |  8906 MB  |
    +-------------------+----------------------------------------------------------------------------------------------------------------------+-----------+-------+-------+-------+-----------+
@@ -247,15 +255,19 @@ Available pre-trained models for paraphrase identification:
 .. table::
    :widths: auto
 
-   +------------------------+-----------------------------------------------------------------------------------------------+---------------+----------------+---------+----------+---------------+----------------+----------+
-   |    Dataset             |Model config                                                                                   | Val (accuracy)| Test (accuracy)| Val (F1)| Test (F1)| Val (log_loss)| Test (log_loss)|Downloads |
-   +========================+===============================================================================================+===============+================+=========+==========+===============+================+==========+
-   |`paraphraser.ru`_       |:config:`paraphrase_ident_paraphraser_ft <ranking/paraphrase_ident_paraphraser_interact.json>` |   83.8        |   75.4         |   87.9  |  80.9    |   0.468       |   0.616        |5938M     |
-   +------------------------+-----------------------------------------------------------------------------------------------+---------------+----------------+---------+----------+---------------+----------------+----------+
-   |`paraphraser.ru`_       |:config:`paraphrase_bert_multilingual <classifiers/paraphraser_bert.json>`                     |   87.4        |   79.3         |   90.2  |  83.4    |   --          |   --           |1330M     |
-   +------------------------+-----------------------------------------------------------------------------------------------+---------------+----------------+---------+----------+---------------+----------------+----------+
-   |`paraphraser.ru`_       |:config:`paraphrase_rubert <classifiers/paraphraser_rubert.json>`                              |   90.2        |   84.9         |   92.3  |  87.9    |   --          |   --           |1325M     |
-   +------------------------+-----------------------------------------------------------------------------------------------+---------------+----------------+---------+----------+---------------+----------------+----------+
+   +------------------------+------------------------------------------------------------------------------------------------------------------+----------------+-----------------+------------+------------+----------------+-----------------+-----------+
+   |    Dataset             | Model config                                                                                                     | Val (accuracy) | Test (accuracy) | Val (F1)   | Test (F1)  | Val (log_loss) | Test (log_loss) | Downloads |
+   +========================+==================================================================================================================+================+=================+============+============+================+=================+===========+
+   | `paraphraser.ru`_      | :config:`paraphrase_ident_paraphraser_ft <ranking/paraphrase_ident_paraphraser_interact.json>`                   |   83.8         |   75.4          |   87.9     |  80.9      |   0.468        |   0.616         | 5938M     |
+   +------------------------+------------------------------------------------------------------------------------------------------------------+----------------+-----------------+------------+------------+----------------+-----------------+-----------+
+   | `paraphraser.ru`_      | :config:`paraphrase_bert_multilingual <classifiers/paraphraser_bert.json>`                                       |   87.4         |   79.3          |   90.2     |  83.4      |   --           |   --            | 1330M     |
+   +------------------------+------------------------------------------------------------------------------------------------------------------+----------------+-----------------+------------+------------+----------------+-----------------+-----------+
+   | `paraphraser.ru`_      | :config:`paraphrase_rubert <classifiers/paraphraser_rubert.json>`                                                |   90.2         |   84.9          |   92.3     |  87.9      |   --           |   --            | 1325M     |
+   +------------------------+------------------------------------------------------------------------------------------------------------------+----------------+-----------------+------------+------------+----------------+-----------------+-----------+
+   | `paraphraser.ru`_      | :config:`paraphraser_convers_distilrubert_2L_torch <classifiers/paraphraser_convers_distilrubert_2L_torch.json>` |  76.1 ± 0.2    |  64.5 ± 0.5     | 81.8 ± 0.2 | 73.9 ± 0.8 |   --           |   --            | 618M      |
+   +------------------------+------------------------------------------------------------------------------------------------------------------+----------------+-----------------+------------+------------+----------------+-----------------+-----------+
+   | `paraphraser.ru`_      | :config:`paraphraser_convers_distilrubert_6L_torch <classifiers/paraphraser_convers_distilrubert_6L_torch.json>` |  86.5 ± 0.5    |  78.9 ± 0.4     | 89.6 ± 0.3 | 83.2 ± 0.5 |   --           |   --            | 930M      |
+   +------------------------+------------------------------------------------------------------------------------------------------------------+----------------+-----------------+------------+------------+----------------+-----------------+-----------+
 
 .. _`paraphraser.ru`: https://paraphraser.ru/
 
@@ -319,25 +331,29 @@ BERT-based model is described in  `BERT: Pre-training of Deep Bidirectional Tran
 R-Net model is based on `R-NET: Machine Reading Comprehension with Self-matching Networks
 <https://www.microsoft.com/en-us/research/publication/mcr/>`__.
 
-+---------------+------------------------------------------------------------------------+-------+----------------+-----------------+-----------------+
-|    Dataset    | Model config                                                           | lang  |    EM (dev)    |    F-1 (dev)    |    Downloads    |
-+===============+========================================================================+=======+================+=================+=================+
-| `SQuAD-v1.1`_ | :config:`DeepPavlov BERT <squad/squad_bert.json>`                      |  en   |     80.88      |     88.49       |     806Mb       |
-+---------------+------------------------------------------------------------------------+-------+----------------+-----------------+-----------------+
-| `SQuAD-v1.1`_ | :config:`DeepPavlov BERT on PyTorch <squad/squad_torch_bert.json>`     |  en   |    80.79       |   88.30         |     1.1 Gb      |
-+---------------+------------------------------------------------------------------------+-------+----------------+-----------------+-----------------+
-| `SQuAD-v1.1`_ | :config:`DeepPavlov R-Net <squad/squad.json>`                          |  en   |     71.49      |     80.34       |     ~2.5Gb      |
-+---------------+------------------------------------------------------------------------+-------+----------------+-----------------+-----------------+
-|  SDSJ Task B  | :config:`DeepPavlov RuBERT <squad/squad_ru_bert_infer.json>`           |  ru   |  66.30+-0.24   |   84.60+-0.11   |   1325Mb        |
-+---------------+------------------------------------------------------------------------+-------+----------------+-----------------+-----------------+
-|  SDSJ Task B  | :config:`DeepPavlov multilingual BERT <squad/squad_ru_bert_infer.json>`|  ru   |  64.35+-0.39   |   83.39+-0.08   |   1323Mb        |
-+---------------+------------------------------------------------------------------------+-------+----------------+-----------------+-----------------+
-|  SDSJ Task B  | :config:`DeepPavlov R-Net <squad/squad_ru.json>`                       |  ru   |     60.62      |     80.04       |     ~5Gb        |
-+---------------+------------------------------------------------------------------------+-------+----------------+-----------------+-----------------+
-|    `DRCD`_    | :config:`DeepPavlov multilingual BERT <squad/squad_zh_bert_mult.json>` |  ch   |     84.86      |     89.03       |     630Mb       |
-+---------------+------------------------------------------------------------------------+-------+----------------+-----------------+-----------------+
-|    `DRCD`_    | :config:`DeepPavlov Chinese BERT <squad/squad_zh_bert_zh.json>`        |  ch   |     84.19      |     89.23       |     362Mb       |
-+---------------+------------------------------------------------------------------------+-------+----------------+-----------------+-----------------+
++---------------+---------------------------------------------------------------------------------------------------+-------+----------------+-----------------+-----------------+
+|    Dataset    | Model config                                                                                      | lang  |    EM (dev)    |    F-1 (dev)    |    Downloads    |
++===============+===================================================================================================+=======+================+=================+=================+
+| `SQuAD-v1.1`_ | :config:`DeepPavlov BERT <squad/squad_bert.json>`                                                 |  en   |     80.88      |     88.49       |     806Mb       |
++---------------+---------------------------------------------------------------------------------------------------+-------+----------------+-----------------+-----------------+
+| `SQuAD-v1.1`_ | :config:`DeepPavlov BERT on PyTorch <squad/squad_torch_bert.json>`                                |  en   |     80.79      |     88.30       |     1.1 Gb      |
++---------------+---------------------------------------------------------------------------------------------------+-------+----------------+-----------------+-----------------+
+| `SQuAD-v1.1`_ | :config:`DeepPavlov R-Net <squad/squad.json>`                                                     |  en   |     71.49      |     80.34       |     ~2.5Gb      |
++---------------+---------------------------------------------------------------------------------------------------+-------+----------------+-----------------+-----------------+
+|  SDSJ Task B  | :config:`DeepPavlov RuBERT <squad/squad_ru_bert_infer.json>`                                      |  ru   |  66.30 ± 0.24  |   84.60 ± 0.11  |     1325Mb      |
++---------------+---------------------------------------------------------------------------------------------------+-------+----------------+-----------------+-----------------+
+|  SDSJ Task B  | :config:`DeepPavlov multilingual BERT <squad/squad_ru_bert_infer.json>`                           |  ru   |  64.35 ± 0.39  |   83.39 ± 0.08  |     1323Mb      |
++---------------+---------------------------------------------------------------------------------------------------+-------+----------------+-----------------+-----------------+
+|  SDSJ Task B  | :config:`DeepPavlov R-Net <squad/squad_ru.json>`                                                  |  ru   |     60.62      |     80.04       |     ~5Gb        |
++---------------+---------------------------------------------------------------------------------------------------+-------+----------------+-----------------+-----------------+
+|  SDSJ Task B  | :config:`DeepPavlov DistilRuBERT-tiny <squad/squad_ru_convers_distilrubert_2L_torch_infer.json>`  |  ru   |  44.2 ± 0.46   |  65.1 ± 0.36    |     867Mb       |
++---------------+---------------------------------------------------------------------------------------------------+-------+----------------+-----------------+-----------------+
+|  SDSJ Task B  | :config:`DeepPavlov DistilRuBERT-base <squad/squad_ru_convers_distilrubert_6L_torch_infer.json>`  |  ru   |  61.23 ± 0.42  |  80.36 ± 0.28   |     1.18Gb      |
++---------------+---------------------------------------------------------------------------------------------------+-------+----------------+-----------------+-----------------+
+|    `DRCD`_    | :config:`DeepPavlov multilingual BERT <squad/squad_zh_bert_mult.json>`                            |  ch   |     84.86      |     89.03       |     630Mb       |
++---------------+---------------------------------------------------------------------------------------------------+-------+----------------+-----------------+-----------------+
+|    `DRCD`_    | :config:`DeepPavlov Chinese BERT <squad/squad_zh_bert_zh.json>`                                   |  ch   |     84.19      |     89.23       |     362Mb       |
++---------------+---------------------------------------------------------------------------------------------------+-------+----------------+-----------------+-----------------+
 
 In the case when answer is not necessary present in given context we have :config:`squad_noans <squad/multi_squad_noans.json>`
 model. This model outputs empty string in case if there is no answer in context.
@@ -361,31 +377,31 @@ For more scores see :doc:`full table </features/models/morphotagger>`.
     +----------------------+--------------------------------------------------------------------------------------------------------------+---------------+----------------+--------------------+
     |    Dataset           | Model                                                                                                        | Word accuracy | Sent. accuracy | Download size (MB) |
     +======================+==============================================================================================================+===============+================+====================+
-    |`UD2.3`_ (Russian)    |`UD Pipe 2.3`_ (Straka et al., 2017)                                                                          |    93.5       |                |                    |
+    | `UD2.3`_ (Russian)   | `UD Pipe 2.3`_ (Straka et al., 2017)                                                                         |    93.5       |                |                    |
     |                      +--------------------------------------------------------------------------------------------------------------+---------------+----------------+--------------------+
     |                      | `UD Pipe Future`_ (Straka et al., 2018)                                                                      |    96.90      |                |                    |
     |                      +--------------------------------------------------------------------------------------------------------------+---------------+----------------+--------------------+
-    |                      |:config:`BERT-based model <morpho_tagger/BERT/morpho_ru_syntagrus_bert.json>`                                 |    97.83      |     72.02      |       661          |
+    |                      | :config:`BERT-based model <morpho_tagger/BERT/morpho_ru_syntagrus_bert.json>`                                |    97.83      |     72.02      |       661          |
     +----------------------+--------------------------------------------------------------------------------------------------------------+---------------+----------------+--------------------+
-    |                      |`Pymorphy`_ + `russian_tagsets`_ (first tag)                                                                  |     60.93     |      0.00      |                    |
+    |                      | `Pymorphy`_ + `russian_tagsets`_ (first tag)                                                                 |     60.93     |      0.00      |                    |
     +                      +--------------------------------------------------------------------------------------------------------------+---------------+----------------+--------------------+
-    |`UD2.0`_ (Russian)    |`UD Pipe 1.2`_ (Straka et al., 2017)                                                                          |     93.57     |     43.04      |                    |
+    | `UD2.0`_ (Russian)   | `UD Pipe 1.2`_ (Straka et al., 2017)                                                                         |     93.57     |     43.04      |                    |
     +                      +--------------------------------------------------------------------------------------------------------------+---------------+----------------+--------------------+
-    |                      |:config:`Basic model <morpho_tagger/UD2.0/ru_syntagrus/morpho_ru_syntagrus.json>`                             |     95.17     |     50.58      |        48.7        |
+    |                      | :config:`Basic model <morpho_tagger/UD2.0/ru_syntagrus/morpho_ru_syntagrus.json>`                            |     95.17     |     50.58      |        48.7        |
     +                      +--------------------------------------------------------------------------------------------------------------+---------------+----------------+--------------------+
-    |                      |:config:`Pymorphy-enhanced model <morpho_tagger/UD2.0/ru_syntagrus/morpho_ru_syntagrus_pymorphy.json>`        |   **96.23**   |     58.00      |        48.7        |
+    |                      | :config:`Pymorphy-enhanced model <morpho_tagger/UD2.0/ru_syntagrus/morpho_ru_syntagrus_pymorphy.json>`       |   **96.23**   |     58.00      |        48.7        |
     +----------------------+--------------------------------------------------------------------------------------------------------------+---------------+----------------+--------------------+
-    | `UD2.0`_ (Czech)     |`UD Pipe 1.2`_ (Straka et al., 2017)                                                                          |     91.86     |     42.28      |                    |
+    | `UD2.0`_ (Czech)     | `UD Pipe 1.2`_ (Straka et al., 2017)                                                                         |     91.86     |     42.28      |                    |
     |                      +--------------------------------------------------------------------------------------------------------------+---------------+----------------+--------------------+
-    |                      |:config:`Basic model <morpho_tagger/UD2.0/morpho_cs.json>`                                                    |   **94.35**   |     51.56      |        41.8        |
+    |                      | :config:`Basic model <morpho_tagger/UD2.0/morpho_cs.json>`                                                   |   **94.35**   |     51.56      |        41.8        |
     +----------------------+--------------------------------------------------------------------------------------------------------------+---------------+----------------+--------------------+
-    |`UD2.0`_ (English)    |`UD Pipe 1.2`_ (Straka et al., 2017)                                                                          |     92.89     |     55.75      |                    |
+    | `UD2.0`_ (English)   | `UD Pipe 1.2`_ (Straka et al., 2017)                                                                         |     92.89     |     55.75      |                    |
     |                      +--------------------------------------------------------------------------------------------------------------+---------------+----------------+--------------------+
-    |                      |:config:`Basic model <morpho_tagger/UD2.0/morpho_en.json>`                                                    |   **93.00**   |     55.18      |        16.9        |
+    |                      | :config:`Basic model <morpho_tagger/UD2.0/morpho_en.json>`                                                   |   **93.00**   |     55.18      |        16.9        |
     +----------------------+--------------------------------------------------------------------------------------------------------------+---------------+----------------+--------------------+
-    |`UD2.0`_ (German)     |`UD Pipe 1.2`_ (Straka et al., 2017)                                                                          |     76.65     |     10.24      |                    |
+    | `UD2.0`_ (German)    | `UD Pipe 1.2`_ (Straka et al., 2017)                                                                         |     76.65     |     10.24      |                    |
     |                      +--------------------------------------------------------------------------------------------------------------+---------------+----------------+--------------------+
-    |                      |:config:`Basic model <morpho_tagger/UD2.0/morpho_de.json>`                                                    |   **83.83**   |     15.25      |        18.6        |
+    |                      | :config:`Basic model <morpho_tagger/UD2.0/morpho_de.json>`                                                   |   **83.83**   |     15.25      |        18.6        |
     +----------------------+--------------------------------------------------------------------------------------------------------------+---------------+----------------+--------------------+
 
 .. _`Pymorphy`: https://pymorphy2.readthedocs.io/en/latest/
@@ -415,7 +431,7 @@ on ``ru_syntagrus`` Russian corpus (version UD 2.3).
     |                         +-------------------------------------------------------------------------------------------+---------+----------+
     |                         | `UDify (multilingual BERT)`_ (Kondratyuk, 2018)                                           | 94.8    | 93.1     |
     |                         +-------------------------------------------------------------------------------------------+---------+----------+
-    |                         |:config:`our BERT model <syntax/syntax_ru_syntagrus_bert.json>`                            | 95.2    | 93.7     |
+    |                         | :config:`our BERT model <syntax/syntax_ru_syntagrus_bert.json>`                           | 95.2    | 93.7     |
     +-------------------------+-------------------------------------------------------------------------------------------+---------+----------+
 
 .. _`UD2.3`: http://hdl.handle.net/11234/1-2895
@@ -473,13 +489,13 @@ based on its Wikipedia knowledge.
 +----------------+--------------------------------------------------------------------+-----------------------+--------+-----------+
 | Dataset        | Model config                                                       |  Wiki dump            |   F1   | Downloads |
 +================+====================================================================+=======================+========+===========+
-| `SQuAD-v1.1`_  |:config:`ODQA <odqa/en_odqa_infer_wiki.json>`                       | enwiki (2018-02-11)   |  35.89 | 9.7Gb     |
+| `SQuAD-v1.1`_  | :config:`ODQA <odqa/en_odqa_infer_wiki.json>`                      | enwiki (2018-02-11)   |  35.89 | 9.7Gb     |
 +----------------+--------------------------------------------------------------------+-----------------------+--------+-----------+
-| `SQuAD-v1.1`_  |:config:`ODQA <odqa/en_odqa_infer_enwiki20161221.json>`             | enwiki (2016-12-21)   |  37.83 | 9.3Gb     |
+| `SQuAD-v1.1`_  | :config:`ODQA <odqa/en_odqa_infer_enwiki20161221.json>`            | enwiki (2016-12-21)   |  37.83 | 9.3Gb     |
 +----------------+--------------------------------------------------------------------+-----------------------+--------+-----------+
-| `SDSJ Task B`_ |:config:`ODQA <odqa/ru_odqa_infer_wiki.json>`                       | ruwiki (2018-04-01)   |  28.56 | 7.7Gb     |
+| `SDSJ Task B`_ | :config:`ODQA <odqa/ru_odqa_infer_wiki.json>`                      | ruwiki (2018-04-01)   |  28.56 | 7.7Gb     |
 +----------------+--------------------------------------------------------------------+-----------------------+--------+-----------+
-| `SDSJ Task B`_ |:config:`ODQA with RuBERT <odqa/ru_odqa_infer_wiki_rubert.json>`    | ruwiki (2018-04-01)   |  37.83 | 4.3Gb     |
+| `SDSJ Task B`_ | :config:`ODQA with RuBERT <odqa/ru_odqa_infer_wiki_rubert.json>`   | ruwiki (2018-04-01)   |  37.83 | 4.3Gb     |
 +----------------+--------------------------------------------------------------------+-----------------------+--------+-----------+
 
 

From 544508d616daca02012e4c67559a4ddf36a4af16 Mon Sep 17 00:00:00 2001
From: ayeffkay <chatton1@mail.ru>
Date: Tue, 13 Jul 2021 18:36:51 +0300
Subject: [PATCH 04/17] Distil configs for paraphraser

---
 .../classifiers/paraphraser_convers_distilrubert_2L_torch.json  | 2 +-
 .../classifiers/paraphraser_convers_distilrubert_6L_torch.json  | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/deeppavlov/configs/classifiers/paraphraser_convers_distilrubert_2L_torch.json b/deeppavlov/configs/classifiers/paraphraser_convers_distilrubert_2L_torch.json
index 4e51808e89..aabfa9d7b5 100644
--- a/deeppavlov/configs/classifiers/paraphraser_convers_distilrubert_2L_torch.json
+++ b/deeppavlov/configs/classifiers/paraphraser_convers_distilrubert_2L_torch.json
@@ -32,7 +32,7 @@
         "hidden_keep_prob": 1.0, 
         "optimizer": "AdamW",
         "optimizer_parameters": {
-          "lr": 1.89e-5
+          "lr": 1.89e-05
         },
         "learning_rate_drop_patience": 3,
         "learning_rate_drop_div": 1.5,
diff --git a/deeppavlov/configs/classifiers/paraphraser_convers_distilrubert_6L_torch.json b/deeppavlov/configs/classifiers/paraphraser_convers_distilrubert_6L_torch.json
index ee21189915..aab78819d2 100644
--- a/deeppavlov/configs/classifiers/paraphraser_convers_distilrubert_6L_torch.json
+++ b/deeppavlov/configs/classifiers/paraphraser_convers_distilrubert_6L_torch.json
@@ -32,7 +32,7 @@
         "hidden_keep_prob": 0.67, 
         "optimizer": "AdamW",
         "optimizer_parameters": {
-          "lr": 7.22e-5
+          "lr": 7.22e-05
         },
         "learning_rate_drop_patience": 3,
         "learning_rate_drop_div": 1.5,

From 05ae90842a064a9a9f23cd8897c6dbb99b82b27c Mon Sep 17 00:00:00 2001
From: ayeffkay <chatton1@mail.ru>
Date: Tue, 13 Jul 2021 18:38:02 +0300
Subject: [PATCH 05/17] Distil configs for rusentiment

---
 ...ntiment_convers_distilrubert_2L_torch.json | 153 ++++++++++++++++++
 ...ntiment_convers_distilrubert_6L_torch.json | 153 ++++++++++++++++++
 2 files changed, 306 insertions(+)
 create mode 100644 deeppavlov/configs/classifiers/rusentiment_convers_distilrubert_2L_torch.json
 create mode 100644 deeppavlov/configs/classifiers/rusentiment_convers_distilrubert_6L_torch.json

diff --git a/deeppavlov/configs/classifiers/rusentiment_convers_distilrubert_2L_torch.json b/deeppavlov/configs/classifiers/rusentiment_convers_distilrubert_2L_torch.json
new file mode 100644
index 0000000000..505fefb545
--- /dev/null
+++ b/deeppavlov/configs/classifiers/rusentiment_convers_distilrubert_2L_torch.json
@@ -0,0 +1,153 @@
+{
+  "dataset_reader": {
+    "class_name": "basic_classification_reader",
+    "x": "text",
+    "y": "label",
+    "data_path": "{DOWNLOADS_PATH}/rusentiment/",
+    "train": "rusentiment_random_posts.csv",
+    "test": "rusentiment_test.csv"
+  },
+  "dataset_iterator": {
+    "class_name": "basic_classification_iterator",
+    "seed": 42,
+    "split_seed": 23,
+    "field_to_split": "train",
+    "split_fields": [
+      "train",
+      "valid"
+    ],
+    "split_proportions": [
+      0.9,
+      0.1
+    ]
+  },
+  "chainer": {
+    "in": [
+      "x"
+    ],
+    "in_y": [
+      "y"
+    ],
+    "pipe": [
+      {
+        "class_name": "torch_transformers_preprocessor",
+        "vocab_file": "{TRANSFORMER}",
+        "do_lower_case": true,
+        "max_seq_length": 64,
+        "in": [
+          "x"
+        ],
+        "out": [
+          "bert_features"
+        ]
+      },
+      {
+        "id": "classes_vocab",
+        "class_name": "simple_vocab",
+        "fit_on": [
+          "y"
+        ],
+        "save_path": "{MODEL_PATH}/classes.dict",
+        "load_path": "{MODEL_PATH}/classes.dict",
+        "in": "y",
+        "out": "y_ids"
+      },
+      {
+        "in": "y_ids",
+        "out": "y_onehot",
+        "class_name": "one_hotter",
+        "depth": "#classes_vocab.len",
+        "single_vector": true
+      },
+      {
+        "class_name": "torch_transformers_classifier",
+        "n_classes": "#classes_vocab.len",
+        "return_probas": true,
+        "pretrained_bert": "{TRANSFORMER}",
+        "save_path": "{MODEL_PATH}/model",
+        "load_path": "{MODEL_PATH}/model",
+        "attention_probs_keep_prob": 0.78,
+        "hidden_keep_prob": 0.89,
+        "optimizer": "AdamW",
+        "optimizer_parameters": {
+          "lr": 7.22e-05
+        },
+        "learning_rate_drop_patience": 5,
+        "learning_rate_drop_div": 1.5,
+        "in": [
+          "bert_features"
+        ],
+        "in_y": [
+          "y_ids"
+        ],
+        "out": [
+          "y_pred_probas"
+        ]
+      },
+      {
+        "in": "y_pred_probas",
+        "out": "y_pred_ids",
+        "class_name": "proba2labels",
+        "max_proba": true
+      },
+      {
+        "in": "y_pred_ids",
+        "out": "y_pred_labels",
+        "ref": "classes_vocab"
+      }
+    ],
+    "out": [
+      "y_pred_labels"
+    ]
+  },
+  "train": {
+    "epochs": 100,
+    "batch_size": 64,
+    "metrics": [
+      "f1_weighted",
+      "f1_macro",
+      "accuracy",
+      {
+        "name": "roc_auc",
+        "inputs": [
+          "y_onehot",
+          "y_pred_probas"
+        ]
+      }
+    ],
+    "validation_patience": 5,
+    "val_every_n_epochs": 1,
+    "log_every_n_epochs": 1,
+    "show_examples": false,
+    "evaluation_targets": [
+      "train",
+      "valid",
+      "test"
+    ],
+    "tensorboard_log_dir": "{MODEL_PATH}/",
+    "class_name": "torch_trainer"
+  },
+  "metadata": {
+    "variables": {
+      "ROOT_PATH": "~/.deeppavlov",
+      "DOWNLOADS_PATH": "{ROOT_PATH}/downloads",
+      "TRANSFORMER": "DeepPavlov/distilrubert-tiny-cased-conversational",
+      "MODELS_PATH": "{ROOT_PATH}/models",
+      "MODEL_PATH": "{MODELS_PATH}/classifiers/rusentiment_convers_distilrubert_2L_torch"
+    },
+    "download": [
+      {
+        "url": "https://raw.githubusercontent.com/strawberrypie/rusentiment/master/Dataset/rusentiment_random_posts.csv",
+        "subdir": "{DOWNLOADS_PATH}/rusentiment"
+      },
+      {
+        "url": "https://raw.githubusercontent.com/strawberrypie/rusentiment/master/Dataset/rusentiment_test.csv",
+        "subdir": "{DOWNLOADS_PATH}/rusentiment"
+      },
+      {
+        "url": "http://files.deeppavlov.ai/deeppavlov_data/classifiers/rusentiment_convers_distilrubert_2L_torch.tar.gz",
+        "subdir": "{MODELS_PATH}/classifiers/"
+      }
+    ]
+  }
+}
diff --git a/deeppavlov/configs/classifiers/rusentiment_convers_distilrubert_6L_torch.json b/deeppavlov/configs/classifiers/rusentiment_convers_distilrubert_6L_torch.json
new file mode 100644
index 0000000000..9d06ab2701
--- /dev/null
+++ b/deeppavlov/configs/classifiers/rusentiment_convers_distilrubert_6L_torch.json
@@ -0,0 +1,153 @@
+{
+  "dataset_reader": {
+    "class_name": "basic_classification_reader",
+    "x": "text",
+    "y": "label",
+    "data_path": "{DOWNLOADS_PATH}/rusentiment/",
+    "train": "rusentiment_random_posts.csv",
+    "test": "rusentiment_test.csv"
+  },
+  "dataset_iterator": {
+    "class_name": "basic_classification_iterator",
+    "seed": 42,
+    "split_seed": 23,
+    "field_to_split": "train",
+    "split_fields": [
+      "train",
+      "valid"
+    ],
+    "split_proportions": [
+      0.9,
+      0.1
+    ]
+  },
+  "chainer": {
+    "in": [
+      "x"
+    ],
+    "in_y": [
+      "y"
+    ],
+    "pipe": [
+      {
+        "class_name": "torch_transformers_preprocessor",
+        "vocab_file": "{TRANSFORMER}",
+        "do_lower_case": true,
+        "max_seq_length": 64,
+        "in": [
+          "x"
+        ],
+        "out": [
+          "bert_features"
+        ]
+      },
+      {
+        "id": "classes_vocab",
+        "class_name": "simple_vocab",
+        "fit_on": [
+          "y"
+        ],
+        "save_path": "{MODEL_PATH}/classes.dict",
+        "load_path": "{MODEL_PATH}/classes.dict",
+        "in": "y",
+        "out": "y_ids"
+      },
+      {
+        "in": "y_ids",
+        "out": "y_onehot",
+        "class_name": "one_hotter",
+        "depth": "#classes_vocab.len",
+        "single_vector": true
+      },
+      {
+        "class_name": "torch_transformers_classifier",
+        "n_classes": "#classes_vocab.len",
+        "return_probas": true,
+        "pretrained_bert": "{TRANSFORMER}",
+        "save_path": "{MODEL_PATH}/model",
+        "load_path": "{MODEL_PATH}/model",
+        "attention_probs_keep_prob": 0.78,
+        "hidden_keep_prob": 0.0,
+        "optimizer": "AdamW",
+        "optimizer_parameters": {
+          "lr": 4.56e-05
+        },
+        "learning_rate_drop_patience": 5,
+        "learning_rate_drop_div": 1.5,
+        "in": [
+          "bert_features"
+        ],
+        "in_y": [
+          "y_ids"
+        ],
+        "out": [
+          "y_pred_probas"
+        ]
+      },
+      {
+        "in": "y_pred_probas",
+        "out": "y_pred_ids",
+        "class_name": "proba2labels",
+        "max_proba": true
+      },
+      {
+        "in": "y_pred_ids",
+        "out": "y_pred_labels",
+        "ref": "classes_vocab"
+      }
+    ],
+    "out": [
+      "y_pred_labels"
+    ]
+  },
+  "train": {
+    "epochs": 100,
+    "batch_size": 64,
+    "metrics": [
+      "f1_weighted",
+      "f1_macro",
+      "accuracy",
+      {
+        "name": "roc_auc",
+        "inputs": [
+          "y_onehot",
+          "y_pred_probas"
+        ]
+      }
+    ],
+    "validation_patience": 5,
+    "val_every_n_epochs": 1,
+    "log_every_n_epochs": 1,
+    "show_examples": false,
+    "evaluation_targets": [
+      "train",
+      "valid",
+      "test"
+    ],
+    "tensorboard_log_dir": "{MODEL_PATH}/",
+    "class_name": "torch_trainer"
+  },
+  "metadata": {
+    "variables": {
+      "ROOT_PATH": "~/.deeppavlov",
+      "DOWNLOADS_PATH": "{ROOT_PATH}/downloads",
+      "TRANSFORMER": "DeepPavlov/distilrubert-base-cased-conversational",
+      "MODELS_PATH": "{ROOT_PATH}/models",
+      "MODEL_PATH": "{MODELS_PATH}/classifiers/rusentiment_convers_distilrubert_6L_torch"
+    },
+    "download": [
+      {
+        "url": "https://raw.githubusercontent.com/strawberrypie/rusentiment/master/Dataset/rusentiment_random_posts.csv",
+        "subdir": "{DOWNLOADS_PATH}/rusentiment"
+      },
+      {
+        "url": "https://raw.githubusercontent.com/strawberrypie/rusentiment/master/Dataset/rusentiment_test.csv",
+        "subdir": "{DOWNLOADS_PATH}/rusentiment"
+      },
+      {
+        "url": "http://files.deeppavlov.ai/deeppavlov_data/classifiers/rusentiment_convers_distilrubert_6L_torch.tar.gz",
+        "subdir": "{MODELS_PATH}/classifiers/"
+      }
+    ]
+  }
+}

From 74eeedd47e5df72c1ca4100226a3dd67c5eb4a0f Mon Sep 17 00:00:00 2001
From: ayeffkay <chatton1@mail.ru>
Date: Tue, 13 Jul 2021 18:41:17 +0300
Subject: [PATCH 06/17] Distil configs for ner

---
 ...ner_rus_convers_distilrubert_2L_torch.json | 155 ++++++++++++++++++
 ...ner_rus_convers_distilrubert_6L_torch.json | 155 ++++++++++++++++++
 2 files changed, 310 insertions(+)
 create mode 100644 deeppavlov/configs/ner/ner_rus_convers_distilrubert_2L_torch.json
 create mode 100644 deeppavlov/configs/ner/ner_rus_convers_distilrubert_6L_torch.json

diff --git a/deeppavlov/configs/ner/ner_rus_convers_distilrubert_2L_torch.json b/deeppavlov/configs/ner/ner_rus_convers_distilrubert_2L_torch.json
new file mode 100644
index 0000000000..147aef5cee
--- /dev/null
+++ b/deeppavlov/configs/ner/ner_rus_convers_distilrubert_2L_torch.json
@@ -0,0 +1,155 @@
+{
+  "dataset_reader": {
+    "class_name": "conll2003_reader",
+    "data_path": "{DOWNLOADS_PATH}/total_rus/",
+    "dataset_name": "collection_rus",
+    "provide_pos": false
+  },
+  "dataset_iterator": {
+    "class_name": "data_learning_iterator"
+  },
+  "chainer": {
+    "in": [
+      "x"
+    ],
+    "in_y": [
+      "y"
+    ],
+    "pipe": [
+      {
+        "class_name": "torch_transformers_ner_preprocessor",
+        "vocab_file": "{TRANSFORMER}",
+        "do_lower_case": false,
+        "max_seq_length": 512,
+        "max_subword_length": 15,
+        "token_masking_prob": 0.0,
+        "in": [
+          "x"
+        ],
+        "out": [
+          "x_tokens",
+          "x_subword_tokens",
+          "x_subword_tok_ids",
+          "startofword_markers",
+          "attention_mask"
+        ]
+      },
+      {
+        "id": "tag_vocab",
+        "class_name": "simple_vocab",
+        "unk_token": [
+          "O"
+        ],
+        "pad_with_zeros": true,
+        "save_path": "{MODEL_PATH}/tag.dict",
+        "load_path": "{MODEL_PATH}/tag.dict",
+        "fit_on": [
+          "y"
+        ],
+        "in": [
+          "y"
+        ],
+        "out": [
+          "y_ind"
+        ]
+      },
+      {
+        "class_name": "torch_transformers_sequence_tagger",
+        "n_tags": "#tag_vocab.len",
+        "pretrained_bert": "{TRANSFORMER}",
+        "attention_probs_keep_prob": 0.11,
+        "hidden_keep_prob": 0.11,
+        "return_probas": false,
+        "encoder_layer_ids": [
+          -1
+        ],
+        "optimizer": "AdamW",
+        "optimizer_parameters": {
+          "lr": 5.45e-05,
+          "weight_decay": 1e-06,
+          "betas": [
+            0.9,
+            0.999
+          ],
+          "eps": 1e-06
+        },
+        "clip_norm": 1.0,
+        "min_learning_rate": 1e-07,
+        "learning_rate_drop_patience": 30,
+        "learning_rate_drop_div": 1.5,
+        "load_before_drop": true,
+        "save_path": "{MODEL_PATH}/model",
+        "load_path": "{MODEL_PATH}/model",
+        "in": [
+          "x_subword_tok_ids",
+          "attention_mask",
+          "startofword_markers"
+        ],
+        "in_y": [
+          "y_ind"
+        ],
+        "out": [
+          "y_pred_ind"
+        ]
+      },
+      {
+        "ref": "tag_vocab",
+        "in": [
+          "y_pred_ind"
+        ],
+        "out": [
+          "y_pred"
+        ]
+      }
+    ],
+    "out": [
+      "x_tokens",
+      "y_pred"
+    ]
+  },
+  "train": {
+    "epochs": 30,
+    "batch_size": 10,
+    "metrics": [
+      {
+        "name": "ner_f1",
+        "inputs": [
+          "y",
+          "y_pred"
+        ]
+      },
+      {
+        "name": "ner_token_f1",
+        "inputs": [
+          "y",
+          "y_pred"
+        ]
+      }
+    ],
+    "validation_patience": 100,
+    "val_every_n_batches": 20,
+    "log_every_n_batches": 20,
+    "show_examples": false,
+    "evaluation_targets": [
+      "valid",
+      "test"
+    ],
+    "tensorboard_log_dir": "{MODEL_PATH}/",
+    "class_name": "torch_trainer"
+  },
+  "metadata": {
+    "variables": {
+      "ROOT_PATH": "~/.deeppavlov",
+      "DOWNLOADS_PATH": "{ROOT_PATH}/downloads",
+      "MODELS_PATH": "{ROOT_PATH}/models",
+      "MODEL_PATH": "{MODELS_PATH}/ner_rus_conversational_distilrubert_2L_torch",
+      "TRANSFORMER": "DeepPavlov/distilrubert-tiny-cased-conversational"
+    },
+    "download": [
+      {
+        "url": "http://files.deeppavlov.ai/v1/ner/ner_rus_conversational_distilrubert_2L_torch.tar.gz",
+        "subdir": "{MODELS_PATH}"
+      }
+    ]
+  }
+}
diff --git a/deeppavlov/configs/ner/ner_rus_convers_distilrubert_6L_torch.json b/deeppavlov/configs/ner/ner_rus_convers_distilrubert_6L_torch.json
new file mode 100644
index 0000000000..0878c393e2
--- /dev/null
+++ b/deeppavlov/configs/ner/ner_rus_convers_distilrubert_6L_torch.json
@@ -0,0 +1,155 @@
+{
+  "dataset_reader": {
+    "class_name": "conll2003_reader",
+    "data_path": "{DOWNLOADS_PATH}/total_rus/",
+    "dataset_name": "collection_rus",
+    "provide_pos": false
+  },
+  "dataset_iterator": {
+    "class_name": "data_learning_iterator"
+  },
+  "chainer": {
+    "in": [
+      "x"
+    ],
+    "in_y": [
+      "y"
+    ],
+    "pipe": [
+      {
+        "class_name": "torch_transformers_ner_preprocessor",
+        "vocab_file": "{TRANSFORMER}",
+        "do_lower_case": false,
+        "max_seq_length": 512,
+        "max_subword_length": 15,
+        "token_masking_prob": 0.0,
+        "in": [
+          "x"
+        ],
+        "out": [
+          "x_tokens",
+          "x_subword_tokens",
+          "x_subword_tok_ids",
+          "startofword_markers",
+          "attention_mask"
+        ]
+      },
+      {
+        "id": "tag_vocab",
+        "class_name": "simple_vocab",
+        "unk_token": [
+          "O"
+        ],
+        "pad_with_zeros": true,
+        "save_path": "{MODEL_PATH}/tag.dict",
+        "load_path": "{MODEL_PATH}/tag.dict",
+        "fit_on": [
+          "y"
+        ],
+        "in": [
+          "y"
+        ],
+        "out": [
+          "y_ind"
+        ]
+      },
+      {
+        "class_name": "torch_transformers_sequence_tagger",
+        "n_tags": "#tag_vocab.len",
+        "pretrained_bert": "{TRANSFORMER}",
+        "attention_probs_keep_prob": 0.44,
+        "hidden_keep_prob": 0.89,
+        "return_probas": false,
+        "encoder_layer_ids": [
+          -1
+        ],
+        "optimizer": "AdamW",
+        "optimizer_parameters": {
+          "lr": 2.78e-05,
+          "weight_decay": 1e-06,
+          "betas": [
+            0.9,
+            0.999
+          ],
+          "eps": 1e-06
+        },
+        "clip_norm": 1.0,
+        "min_learning_rate": 1e-07,
+        "learning_rate_drop_patience": 30,
+        "learning_rate_drop_div": 1.5,
+        "load_before_drop": true,
+        "save_path": "{MODEL_PATH}/model",
+        "load_path": "{MODEL_PATH}/model",
+        "in": [
+          "x_subword_tok_ids",
+          "attention_mask",
+          "startofword_markers"
+        ],
+        "in_y": [
+          "y_ind"
+        ],
+        "out": [
+          "y_pred_ind"
+        ]
+      },
+      {
+        "ref": "tag_vocab",
+        "in": [
+          "y_pred_ind"
+        ],
+        "out": [
+          "y_pred"
+        ]
+      }
+    ],
+    "out": [
+      "x_tokens",
+      "y_pred"
+    ]
+  },
+  "train": {
+    "epochs": 30,
+    "batch_size": 10,
+    "metrics": [
+      {
+        "name": "ner_f1",
+        "inputs": [
+          "y",
+          "y_pred"
+        ]
+      },
+      {
+        "name": "ner_token_f1",
+        "inputs": [
+          "y",
+          "y_pred"
+        ]
+      }
+    ],
+    "validation_patience": 100,
+    "val_every_n_batches": 20,
+    "log_every_n_batches": 20,
+    "show_examples": false,
+    "evaluation_targets": [
+      "valid",
+      "test"
+    ],
+    "tensorboard_log_dir": "{MODEL_PATH}/",
+    "class_name": "torch_trainer"
+  },
+  "metadata": {
+    "variables": {
+      "ROOT_PATH": "~/.deeppavlov",
+      "DOWNLOADS_PATH": "{ROOT_PATH}/downloads",
+      "MODELS_PATH": "{ROOT_PATH}/models",
+      "MODEL_PATH": "{MODELS_PATH}/ner_rus_conversational_distilrubert_6L_torch",
+      "TRANSFORMER": "DeepPavlov/distilrubert-base-cased-conversational"
+    },
+    "download": [
+      {
+        "url": "http://files.deeppavlov.ai/v1/ner/ner_rus_conversational_distilrubert_6L_torch.tar.gz",
+        "subdir": "{MODELS_PATH}"
+      }
+    ]
+  }
+}

From 5ae20e18936527c6b4e14c096ea84823981e1d75 Mon Sep 17 00:00:00 2001
From: ayeffkay <chatton1@mail.ru>
Date: Tue, 13 Jul 2021 18:59:00 +0300
Subject: [PATCH 07/17] Distil squad configs added

---
 ...quad_ru_convers_distilrubert_2L_torch.json | 173 ++++++++++++++++++
 ...u_convers_distilrubert_2L_torch_infer.json |  76 ++++++++
 ...quad_ru_convers_distilrubert_6L_torch.json | 173 ++++++++++++++++++
 ...u_convers_distilrubert_6L_torch_infer.json |  76 ++++++++
 4 files changed, 498 insertions(+)
 create mode 100644 deeppavlov/configs/squad/squad_ru_convers_distilrubert_2L_torch.json
 create mode 100644 deeppavlov/configs/squad/squad_ru_convers_distilrubert_2L_torch_infer.json
 create mode 100644 deeppavlov/configs/squad/squad_ru_convers_distilrubert_6L_torch.json
 create mode 100644 deeppavlov/configs/squad/squad_ru_convers_distilrubert_6L_torch_infer.json

diff --git a/deeppavlov/configs/squad/squad_ru_convers_distilrubert_2L_torch.json b/deeppavlov/configs/squad/squad_ru_convers_distilrubert_2L_torch.json
new file mode 100644
index 0000000000..162a8f7013
--- /dev/null
+++ b/deeppavlov/configs/squad/squad_ru_convers_distilrubert_2L_torch.json
@@ -0,0 +1,173 @@
+{
+  "dataset_reader": {
+    "class_name": "squad_dataset_reader",
+    "dataset": "SberSQuADClean",
+    "url": "http://files.deeppavlov.ai/datasets/sber_squad_clean-v1.1.tar.gz",
+    "data_path": "{DOWNLOADS_PATH}/squad_ru_clean/"
+  },
+  "dataset_iterator": {
+    "class_name": "squad_iterator",
+    "seed": 1337,
+    "shuffle": true
+  },
+  "chainer": {
+    "in": [
+      "context_raw",
+      "question_raw"
+    ],
+    "in_y": [
+      "ans_raw",
+      "ans_raw_start"
+    ],
+    "pipe": [
+      {
+        "class_name": "torch_squad_transformers_preprocessor",
+        "add_token_type_ids": true,
+        "vocab_file": "{TRANSFORMER}",
+        "do_lower_case": "{lowercase}",
+        "max_seq_length": 384,
+        "return_tokens": true,
+        "in": [
+          "question_raw",
+          "context_raw"
+        ],
+        "out": [
+          "bert_features",
+          "subtokens"
+        ]
+      },
+      {
+        "class_name": "squad_bert_mapping",
+        "do_lower_case": "{lowercase}",
+        "in": [
+          "context_raw",
+          "bert_features",
+          "subtokens"
+        ],
+        "out": [
+          "subtok2chars",
+          "char2subtoks"
+        ]
+      },
+      {
+        "class_name": "squad_bert_ans_preprocessor",
+        "do_lower_case": "{lowercase}",
+        "in": [
+          "ans_raw",
+          "ans_raw_start",
+          "char2subtoks"
+        ],
+        "out": [
+          "ans",
+          "ans_start",
+          "ans_end"
+        ]
+      },
+      {
+        "class_name": "torch_transformers_squad",
+        "pretrained_bert": "{TRANSFORMER}",
+        "save_path": "{MODEL_PATH}/model",
+        "load_path": "{MODEL_PATH}/model",
+        "attention_probs_keep_prob": 0.11,
+        "hidden_keep_prob": 0.33,
+        "optimizer": "AdamW",
+        "optimizer_parameters": {
+          "lr": 9e-05
+        },
+        "learning_rate_drop_patience": 2,
+        "learning_rate_drop_div": 1.5,
+        "in": [
+          "bert_features"
+        ],
+        "in_y": [
+          "ans_start",
+          "ans_end"
+        ],
+        "out": [
+          "ans_start_predicted",
+          "ans_end_predicted",
+          "logits"
+        ]
+      },
+      {
+        "class_name": "squad_bert_ans_postprocessor",
+        "in": [
+          "ans_start_predicted",
+          "ans_end_predicted",
+          "context_raw",
+          "bert_features",
+          "subtok2chars",
+          "subtokens"
+        ],
+        "out": [
+          "ans_predicted",
+          "ans_start_predicted",
+          "ans_end_predicted"
+        ]
+      }
+    ],
+    "out": [
+      "ans_predicted",
+      "ans_start_predicted",
+      "logits"
+    ]
+  },
+  "train": {
+    "show_examples": false,
+    "evaluation_targets": [
+      "valid"
+    ],
+    "log_every_n_batches": 250,
+    "val_every_n_batches": 500,
+    "batch_size": 10,
+    "validation_patience": 10,
+    "metrics": [
+      {
+        "name": "squad_v2_f1",
+        "inputs": [
+          "ans",
+          "ans_predicted"
+        ]
+      },
+      {
+        "name": "squad_v2_em",
+        "inputs": [
+          "ans",
+          "ans_predicted"
+        ]
+      },
+      {
+        "name": "squad_v1_f1",
+        "inputs": [
+          "ans",
+          "ans_predicted"
+        ]
+      },
+      {
+        "name": "squad_v1_em",
+        "inputs": [
+          "ans",
+          "ans_predicted"
+        ]
+      }
+    ],
+    "tensorboard_log_dir": "{MODEL_PATH}/logs",
+    "class_name": "torch_trainer"
+  },
+  "metadata": {
+    "variables": {
+      "lowercase": false,
+      "ROOT_PATH": "~/.deeppavlov",
+      "DOWNLOADS_PATH": "{ROOT_PATH}/downloads",
+      "TRANSFORMER": "DeepPavlov/distilrubert-tiny-cased-conversational",
+      "MODELS_PATH": "{ROOT_PATH}/models",
+      "MODEL_PATH": "{MODELS_PATH}/squad_ru_convers_distilrubert_2L_torch"
+    },
+    "download": [
+      {
+        "url": "http://files.deeppavlov.ai/deeppavlov_data/squad_ru_convers_distilrubert_2L_torch.tar.gz",
+        "subdir": "{MODELS_PATH}"
+      }
+    ]
+  }
+}
diff --git a/deeppavlov/configs/squad/squad_ru_convers_distilrubert_2L_torch_infer.json b/deeppavlov/configs/squad/squad_ru_convers_distilrubert_2L_torch_infer.json
new file mode 100644
index 0000000000..b67331a2fc
--- /dev/null
+++ b/deeppavlov/configs/squad/squad_ru_convers_distilrubert_2L_torch_infer.json
@@ -0,0 +1,76 @@
+{
+  "dataset_reader": {
+    "class_name": "squad_dataset_reader",
+    "dataset": "SberSQuADClean",
+    "url": "http://files.deeppavlov.ai/datasets/sber_squad_clean-v1.1.tar.gz",
+    "data_path": "{DOWNLOADS_PATH}/squad_ru_clean/"
+  },
+  "dataset_iterator": {
+    "class_name": "squad_iterator",
+    "seed": 1337,
+    "shuffle": true
+  },
+  "chainer": {
+    "in": ["context_raw", "question_raw"],
+    "in_y": ["ans_raw", "ans_raw_start"],
+    "pipe": [
+        {
+        "class_name": "torch_transformers_squad_infer",
+        "lang": "ru",
+        "batch_size": 128,
+        "squad_model_config": "{CONFIGS_PATH}/squad/squad_ru_convers_distilrubert_2L_torch.json",
+        "vocab_file": "{TRANSFORMER}",
+        "do_lower_case": "{lowercase}",
+        "max_seq_length": 256,
+        "in": ["context_raw", "question_raw"],
+        "out": ["ans_predicted", "ans_start_predicted", "logits"]
+        }
+    ],
+    "out": ["ans_predicted", "ans_start_predicted", "logits"]
+  },
+  "train": {
+    "show_examples": false,
+    "evaluation_targets": [
+      "valid"
+    ],
+    "log_every_n_batches": 250,
+    "val_every_n_batches": 500,
+    "batch_size": 10,
+    "validation_patience": 10,
+    "metrics": [
+      {
+        "name": "squad_v2_f1",
+        "inputs": ["ans_raw", "ans_predicted"]
+      },
+      {
+        "name": "squad_v2_em",
+        "inputs": ["ans_raw", "ans_predicted"]
+      },
+      {
+        "name": "squad_v1_f1",
+        "inputs": ["ans_raw", "ans_predicted"]
+      },
+      {
+        "name": "squad_v1_em",
+        "inputs": ["ans_raw", "ans_predicted"]
+      }
+    ]
+  },
+  "metadata": {
+    "variables": {
+      "lowercase": false,
+      "ROOT_PATH": "~/.deeppavlov",
+      "DOWNLOADS_PATH": "{ROOT_PATH}/downloads",
+      "TRANSFORMER": "DeepPavlov/distilrubert-tiny-cased-conversational",
+      "MODELS_PATH": "{ROOT_PATH}/models",
+      "MODEL_PATH": "{MODELS_PATH}/squad_ru_convers_distilrubert_2L_torch",
+      "CONFIGS_PATH": "{DEEPPAVLOV_PATH}/configs"
+    },
+    "download": [
+      {
+        "url": "http://files.deeppavlov.ai/deeppavlov_data/squad_ru_convers_distilrubert_2L_torch.tar.gz",
+        "subdir": "{MODELS_PATH}"
+      }
+    ]
+  }
+}
diff --git a/deeppavlov/configs/squad/squad_ru_convers_distilrubert_6L_torch.json b/deeppavlov/configs/squad/squad_ru_convers_distilrubert_6L_torch.json
new file mode 100644
index 0000000000..574702120c
--- /dev/null
+++ b/deeppavlov/configs/squad/squad_ru_convers_distilrubert_6L_torch.json
@@ -0,0 +1,173 @@
+{
+  "dataset_reader": {
+    "class_name": "squad_dataset_reader",
+    "dataset": "SberSQuADClean",
+    "url": "http://files.deeppavlov.ai/datasets/sber_squad_clean-v1.1.tar.gz",
+    "data_path": "{DOWNLOADS_PATH}/squad_ru_clean/"
+  },
+  "dataset_iterator": {
+    "class_name": "squad_iterator",
+    "seed": 1337,
+    "shuffle": true
+  },
+  "chainer": {
+    "in": [
+      "context_raw",
+      "question_raw"
+    ],
+    "in_y": [
+      "ans_raw",
+      "ans_raw_start"
+    ],
+    "pipe": [
+      {
+        "class_name": "torch_squad_transformers_preprocessor", 
+        "add_token_type_ids": true, 
+        "vocab_file": "{TRANSFORMER}",
+        "do_lower_case": "{lowercase}",
+        "max_seq_length": 384,
+        "return_tokens": true,
+        "in": [
+          "question_raw",
+          "context_raw"
+        ],
+        "out": [
+          "bert_features",
+          "subtokens"
+        ]
+      },
+      {
+        "class_name": "squad_bert_mapping",
+        "do_lower_case": "{lowercase}",
+        "in": [
+          "context_raw",
+          "bert_features",
+          "subtokens"
+        ],
+        "out": [
+          "subtok2chars",
+          "char2subtoks"
+        ]
+      },
+      {
+        "class_name": "squad_bert_ans_preprocessor",
+        "do_lower_case": "{lowercase}",
+        "in": [
+          "ans_raw",
+          "ans_raw_start",
+          "char2subtoks"
+        ],
+        "out": [
+          "ans",
+          "ans_start",
+          "ans_end"
+        ]
+      },
+      {
+        "class_name": "torch_transformers_squad",
+        "pretrained_bert": "{TRANSFORMER}",
+        "save_path": "{MODEL_PATH}/model",
+        "load_path": "{MODEL_PATH}/model",
+        "attention_probs_keep_prob": 0.0,
+        "hidden_keep_prob": 0.33, 
+        "optimizer": "AdamW",
+        "optimizer_parameters": {
+          "lr": 3.67e-5
+        },
+        "learning_rate_drop_patience": 2,
+        "learning_rate_drop_div": 1.5,
+        "in": [
+          "bert_features"
+        ],
+        "in_y": [
+          "ans_start",
+          "ans_end"
+        ],
+        "out": [
+          "ans_start_predicted",
+          "ans_end_predicted",
+          "logits"
+        ]
+      },
+      {
+        "class_name": "squad_bert_ans_postprocessor",
+        "in": [
+          "ans_start_predicted",
+          "ans_end_predicted",
+          "context_raw",
+          "bert_features",
+          "subtok2chars",
+          "subtokens"
+        ],
+        "out": [
+          "ans_predicted",
+          "ans_start_predicted",
+          "ans_end_predicted"
+        ]
+      }
+    ],
+    "out": [
+      "ans_predicted",
+      "ans_start_predicted",
+      "logits"
+    ]
+  },
+  "train": {
+    "show_examples": false,
+    "evaluation_targets": [
+      "valid"
+    ],
+    "log_every_n_batches": 250,
+    "val_every_n_batches": 500,
+    "batch_size": 10,
+    "validation_patience": 10,
+    "metrics": [
+      {
+        "name": "squad_v2_f1",
+        "inputs": [
+          "ans",
+          "ans_predicted"
+        ]
+      },
+      {
+        "name": "squad_v2_em",
+        "inputs": [
+          "ans",
+          "ans_predicted"
+        ]
+      },
+      {
+        "name": "squad_v1_f1",
+        "inputs": [
+          "ans",
+          "ans_predicted"
+        ]
+      },
+      {
+        "name": "squad_v1_em",
+        "inputs": [
+          "ans",
+          "ans_predicted"
+        ]
+      }
+    ],
+    "tensorboard_log_dir": "{MODEL_PATH}/logs",
+    "class_name": "torch_trainer"
+  },
+  "metadata": {
+    "variables": {
+      "lowercase": false, 
+      "ROOT_PATH": "~/.deeppavlov",
+      "DOWNLOADS_PATH": "{ROOT_PATH}/downloads",
+      "TRANSFORMER": "DeepPavlov/distilrubert-base-cased-conversational",
+      "MODELS_PATH": "{ROOT_PATH}/models",
+      "MODEL_PATH": "{MODELS_PATH}/squad_ru_convers_distilrubert_6L_torch"
+    },
+    "download": [
+      {
+        "url": "http://files.deeppavlov.ai/deeppavlov_data/squad_ru_convers_distilrubert_6L_torch.tar.gz",
+        "subdir": "{MODELS_PATH}"
+      }
+    ]
+  }
+}
diff --git a/deeppavlov/configs/squad/squad_ru_convers_distilrubert_6L_torch_infer.json b/deeppavlov/configs/squad/squad_ru_convers_distilrubert_6L_torch_infer.json
new file mode 100644
index 0000000000..dbe16c055d
--- /dev/null
+++ b/deeppavlov/configs/squad/squad_ru_convers_distilrubert_6L_torch_infer.json
@@ -0,0 +1,76 @@
+{
+  "dataset_reader": {
+    "class_name": "squad_dataset_reader",
+    "dataset": "SberSQuADClean",
+    "url": "http://files.deeppavlov.ai/datasets/sber_squad_clean-v1.1.tar.gz",
+    "data_path": "{DOWNLOADS_PATH}/squad_ru_clean/"
+  },
+  "dataset_iterator": {
+    "class_name": "squad_iterator",
+    "seed": 1337,
+    "shuffle": true
+  },
+  "chainer": {
+    "in": ["context_raw", "question_raw"],
+    "in_y": ["ans_raw", "ans_raw_start"],
+    "pipe": [
+        {
+        "class_name": "torch_transformers_squad_infer",
+        "lang": "ru", 
+        "batch_size": 128,
+        "squad_model_config": "{CONFIGS_PATH}/squad/squad_ru_convers_distilrubert_6L_torch.json",
+        "vocab_file": "{TRANSFORMER}",
+        "do_lower_case": "{lowercase}",
+        "max_seq_length": 256,
+        "in": ["context_raw", "question_raw"],
+        "out": ["ans_predicted", "ans_start_predicted", "logits"]
+        }
+    ],
+    "out": ["ans_predicted", "ans_start_predicted", "logits"]
+  },
+  "train": {
+    "show_examples": false,
+    "evaluation_targets": [
+      "valid"
+    ],
+    "log_every_n_batches": 250,
+    "val_every_n_batches": 500,
+    "batch_size": 10,
+    "validation_patience": 10,
+    "metrics": [
+      {
+        "name": "squad_v2_f1",
+        "inputs": ["ans_raw", "ans_predicted"]
+      },
+      {
+        "name": "squad_v2_em",
+        "inputs": ["ans_raw", "ans_predicted"]
+      },
+      {
+        "name": "squad_v1_f1",
+        "inputs": ["ans_raw", "ans_predicted"]
+      },
+      {
+        "name": "squad_v1_em",
+        "inputs": ["ans_raw", "ans_predicted"]
+      }
+    ]
+  },
+  "metadata": {
+    "variables": {
+      "lowercase": false, 
+      "ROOT_PATH": "~/.deeppavlov",
+      "DOWNLOADS_PATH": "{ROOT_PATH}/downloads",
+      "TRANSFORMER": "DeepPavlov/distilrubert-base-cased-conversational",
+      "MODELS_PATH": "{ROOT_PATH}/models",
+      "MODEL_PATH": "{MODELS_PATH}/squad_ru_convers_distilrubert_6L_torch",
+      "CONFIGS_PATH": "{DEEPPAVLOV_PATH}/configs"
+    },
+    "download": [
+      {
+        "url": "http://files.deeppavlov.ai/deeppavlov_data/squad_ru_convers_distilrubert_6L_torch.tar.gz",
+        "subdir": "{MODELS_PATH}"
+      }
+    ]
+  }
+}

From b3dc15c133808f38eef67d53c764c9c309d6dbd4 Mon Sep 17 00:00:00 2001
From: Fedor Ignatov <ignatov.fedor@gmail.com>
Date: Fri, 30 Jul 2021 14:49:37 +0300
Subject: [PATCH 08/17] feat: removed torch suffix from
 paraphraser_convers_distilrubert_2L_torch

---
 ...2L_torch.json => paraphraser_convers_distilrubert_2L.json} | 4 ++--
 docs/features/overview.rst                                    | 4 ++--
 2 files changed, 4 insertions(+), 4 deletions(-)
 rename deeppavlov/configs/classifiers/{paraphraser_convers_distilrubert_2L_torch.json => paraphraser_convers_distilrubert_2L.json} (96%)

diff --git a/deeppavlov/configs/classifiers/paraphraser_convers_distilrubert_2L_torch.json b/deeppavlov/configs/classifiers/paraphraser_convers_distilrubert_2L.json
similarity index 96%
rename from deeppavlov/configs/classifiers/paraphraser_convers_distilrubert_2L_torch.json
rename to deeppavlov/configs/classifiers/paraphraser_convers_distilrubert_2L.json
index aabfa9d7b5..5dcc8460b5 100644
--- a/deeppavlov/configs/classifiers/paraphraser_convers_distilrubert_2L_torch.json
+++ b/deeppavlov/configs/classifiers/paraphraser_convers_distilrubert_2L.json
@@ -73,11 +73,11 @@
       "DOWNLOADS_PATH": "{ROOT_PATH}/downloads",
       "TRANSFORMER": "DeepPavlov/distilrubert-tiny-cased-conversational",
       "MODELS_PATH": "{ROOT_PATH}/models",
-      "MODEL_PATH": "{MODELS_PATH}/paraphraser_convers_distilrubert_2L_torch"
+      "MODEL_PATH": "{MODELS_PATH}/paraphraser_convers_distilrubert_2L"
     },
     "download": [
         {
-        "url": "http://files.deeppavlov.ai/deeppavlov_data/classifiers/paraphraser_convers_distilrubert_2L_torch.tar.gz",
+        "url": "http://files.deeppavlov.ai/deeppavlov_data/classifiers/paraphraser_convers_distilrubert_2L.tar.gz",
         "subdir": "{MODELS_PATH}"
         }
     ]
diff --git a/docs/features/overview.rst b/docs/features/overview.rst
index 72f58015c3..3503cf36cd 100644
--- a/docs/features/overview.rst
+++ b/docs/features/overview.rst
@@ -264,9 +264,9 @@ Available pre-trained models for paraphrase identification:
    +------------------------+------------------------------------------------------------------------------------------------------------------+----------------+-----------------+------------+------------+----------------+-----------------+-----------+
    | `paraphraser.ru`_      | :config:`paraphrase_rubert <classifiers/paraphraser_rubert.json>`                                                |   90.2         |   84.9          |   92.3     |  87.9      |   --           |   --            | 1325M     |
    +------------------------+------------------------------------------------------------------------------------------------------------------+----------------+-----------------+------------+------------+----------------+-----------------+-----------+
-   | `paraphraser.ru`_      | :config:`paraphraser_convers_distilrubert_2L_torch <classifiers/paraphraser_convers_distilrubert_2L_torch.json>` |  76.1 ± 0.2    |  64.5 ± 0.5     | 81.8 ± 0.2 | 73.9 ± 0.8 |   --           |   --            | 618M      |
+   | `paraphraser.ru`_      | :config:`paraphraser_convers_distilrubert_2L <classifiers/paraphraser_convers_distilrubert_2L.json>` |  76.1 ± 0.2    |  64.5 ± 0.5     | 81.8 ± 0.2 | 73.9 ± 0.8 |   --           |   --            | 618M      |
    +------------------------+------------------------------------------------------------------------------------------------------------------+----------------+-----------------+------------+------------+----------------+-----------------+-----------+
-   | `paraphraser.ru`_      | :config:`paraphraser_convers_distilrubert_2L_torch <classifiers/paraphraser_convers_distilrubert_6L_torch.json>` |  86.5 ± 0.5    |  78.9 ± 0.4     | 89.6 ± 0.3 | 83.2 ± 0.5 |   --           |   --            | 930M      |
+   | `paraphraser.ru`_      | :config:`paraphraser_convers_distilrubert_6L_torch <classifiers/paraphraser_convers_distilrubert_6L_torch.json>` |  86.5 ± 0.5    |  78.9 ± 0.4     | 89.6 ± 0.3 | 83.2 ± 0.5 |   --           |   --            | 930M      |
    +------------------------+------------------------------------------------------------------------------------------------------------------+----------------+-----------------+------------+------------+----------------+-----------------+-----------+
 
 .. _`paraphraser.ru`: https://paraphraser.ru/

From 54830d134b0ed78b12b76ec398f2787a3552f03f Mon Sep 17 00:00:00 2001
From: Fedor Ignatov <ignatov.fedor@gmail.com>
Date: Fri, 30 Jul 2021 15:11:49 +0300
Subject: [PATCH 09/17] feat: added test for paraphraser_convers_distilrubert_2L,
 renamed paraphraser_convers_distilrubert_6L_torch config

---
 ...6L_torch.json => paraphraser_convers_distilrubert_6L.json} | 4 ++--
 tests/test_quick_start.py                                     | 3 +++
 2 files changed, 5 insertions(+), 2 deletions(-)
 rename deeppavlov/configs/classifiers/{paraphraser_convers_distilrubert_6L_torch.json => paraphraser_convers_distilrubert_6L.json} (96%)

diff --git a/deeppavlov/configs/classifiers/paraphraser_convers_distilrubert_6L_torch.json b/deeppavlov/configs/classifiers/paraphraser_convers_distilrubert_6L.json
similarity index 96%
rename from deeppavlov/configs/classifiers/paraphraser_convers_distilrubert_6L_torch.json
rename to deeppavlov/configs/classifiers/paraphraser_convers_distilrubert_6L.json
index aab78819d2..f8a9e96921 100644
--- a/deeppavlov/configs/classifiers/paraphraser_convers_distilrubert_6L_torch.json
+++ b/deeppavlov/configs/classifiers/paraphraser_convers_distilrubert_6L.json
@@ -73,11 +73,11 @@
       "DOWNLOADS_PATH": "{ROOT_PATH}/downloads",
       "TRANSFORMER": "DeepPavlov/distilrubert-base-cased-conversational",
       "MODELS_PATH": "{ROOT_PATH}/models",
-      "MODEL_PATH": "{MODELS_PATH}/paraphraser_convers_distilrubert_6L_torch"
+      "MODEL_PATH": "{MODELS_PATH}/paraphraser_convers_distilrubert_6L"
     },
     "download": [
         {
-        "url": "http://files.deeppavlov.ai/deeppavlov_data/classifiers/paraphraser_convers_distilrubert_6L_torch.tar.gz",
+        "url": "http://files.deeppavlov.ai/deeppavlov_data/classifiers/paraphraser_convers_distilrubert_6L.tar.gz",
         "subdir": "{MODELS_PATH}"
         }
     ]
diff --git a/tests/test_quick_start.py b/tests/test_quick_start.py
index 9c7b642364..7f16fefadc 100644
--- a/tests/test_quick_start.py
+++ b/tests/test_quick_start.py
@@ -144,6 +144,9 @@
         ("classifiers/intents_sample_csv.json", "classifiers", ('TI',)): [ONE_ARGUMENT_INFER_CHECK],
         ("classifiers/intents_sample_json.json", "classifiers", ('TI',)): [ONE_ARGUMENT_INFER_CHECK]
     },
+    "distil": {
+        ("classifiers/paraphraser_convers_distilrubert_2L.json", "distil", ('IP')): [TWO_ARGUMENTS_INFER_CHECK]
+    },
     "entity_linking": {
         ("kbqa/entity_linking_rus.json", "entity_linking",  ('IP',)):
             [

From d79aca0cdb888049c45d1918b36af1d976d9e447 Mon Sep 17 00:00:00 2001
From: Fedor Ignatov <ignatov.fedor@gmail.com>
Date: Fri, 30 Jul 2021 15:12:37 +0300
Subject: [PATCH 10/17] paraphraser_convers_distilrubert_6L renamed in docs,
 test added

---
 docs/features/overview.rst | 2 +-
 tests/test_quick_start.py  | 3 ++-
 2 files changed, 3 insertions(+), 2 deletions(-)

diff --git a/docs/features/overview.rst b/docs/features/overview.rst
index 3503cf36cd..a8b50c10fe 100644
--- a/docs/features/overview.rst
+++ b/docs/features/overview.rst
@@ -266,7 +266,7 @@ Available pre-trained models for paraphrase identification:
    +------------------------+------------------------------------------------------------------------------------------------------------------+----------------+-----------------+------------+------------+----------------+-----------------+-----------+
    | `paraphraser.ru`_      | :config:`paraphraser_convers_distilrubert_2L <classifiers/paraphraser_convers_distilrubert_2L.json>` |  76.1 ± 0.2    |  64.5 ± 0.5     | 81.8 ± 0.2 | 73.9 ± 0.8 |   --           |   --            | 618M      |
    +------------------------+------------------------------------------------------------------------------------------------------------------+----------------+-----------------+------------+------------+----------------+-----------------+-----------+
-   | `paraphraser.ru`_      | :config:`paraphraser_convers_distilrubert_6L_torch <classifiers/paraphraser_convers_distilrubert_6L_torch.json>` |  86.5 ± 0.5    |  78.9 ± 0.4     | 89.6 ± 0.3 | 83.2 ± 0.5 |   --           |   --            | 930M      |
+   | `paraphraser.ru`_      | :config:`paraphraser_convers_distilrubert_6L <classifiers/paraphraser_convers_distilrubert_6L.json>` |  86.5 ± 0.5    |  78.9 ± 0.4     | 89.6 ± 0.3 | 83.2 ± 0.5 |   --           |   --            | 930M      |
    +------------------------+------------------------------------------------------------------------------------------------------------------+----------------+-----------------+------------+------------+----------------+-----------------+-----------+
 
 .. _`paraphraser.ru`: https://paraphraser.ru/
diff --git a/tests/test_quick_start.py b/tests/test_quick_start.py
index 7f16fefadc..7b0dea2d9a 100644
--- a/tests/test_quick_start.py
+++ b/tests/test_quick_start.py
@@ -145,7 +145,8 @@
         ("classifiers/intents_sample_json.json", "classifiers", ('TI',)): [ONE_ARGUMENT_INFER_CHECK]
     },
     "distil": {
-        ("classifiers/paraphraser_convers_distilrubert_2L.json", "distil", ('IP')): [TWO_ARGUMENTS_INFER_CHECK]
+        ("classifiers/paraphraser_convers_distilrubert_2L.json", "distil", ('IP')): [TWO_ARGUMENTS_INFER_CHECK],
+        ("classifiers/paraphraser_convers_distilrubert_6L.json", "distil", ('IP')): [TWO_ARGUMENTS_INFER_CHECK]
     },
     "entity_linking": {
         ("kbqa/entity_linking_rus.json", "entity_linking",  ('IP',)):

From 7c120a5145b0327c5c5e65d284bcb28c4b72adb7 Mon Sep 17 00:00:00 2001
From: Fedor Ignatov <ignatov.fedor@gmail.com>
Date: Fri, 30 Jul 2021 15:24:06 +0300
Subject: [PATCH 11/17] fix html build

---
 docs/features/overview.rst | 22 +++++++++++-----------
 1 file changed, 11 insertions(+), 11 deletions(-)

diff --git a/docs/features/overview.rst b/docs/features/overview.rst
index a8b50c10fe..ab2015ed37 100644
--- a/docs/features/overview.rst
+++ b/docs/features/overview.rst
@@ -255,19 +255,19 @@ Available pre-trained models for paraphrase identification:
 .. table::
    :widths: auto
 
-   +------------------------+------------------------------------------------------------------------------------------------------------------+----------------+-----------------+------------+------------+----------------+-----------------+-----------+
-   |    Dataset             | Model config                                                                                                     | Val (accuracy) | Test (accuracy) | Val (F1)   | Test (F1)  | Val (log_loss) | Test (log_loss) | Downloads |
-   +========================+==================================================================================================================+================+=================+============+============+================+=================+===========+
-   | `paraphraser.ru`_      | :config:`paraphrase_ident_paraphraser_ft <ranking/paraphrase_ident_paraphraser_interact.json>`                   |   83.8         |   75.4          |   87.9     |  80.9      |   0.468        |   0.616         | 5938M     |
-   +------------------------+------------------------------------------------------------------------------------------------------------------+----------------+-----------------+------------+------------+----------------+-----------------+-----------+
-   | `paraphraser.ru`_      | :config:`paraphrase_bert_multilingual <classifiers/paraphraser_bert.json>`                                       |   87.4         |   79.3          |   90.2     |  83.4      |   --           |   --            | 1330M     |
-   +------------------------+------------------------------------------------------------------------------------------------------------------+----------------+-----------------+------------+------------+----------------+-----------------+-----------+
-   | `paraphraser.ru`_      | :config:`paraphrase_rubert <classifiers/paraphraser_rubert.json>`                                                |   90.2         |   84.9          |   92.3     |  87.9      |   --           |   --            | 1325M     |
-   +------------------------+------------------------------------------------------------------------------------------------------------------+----------------+-----------------+------------+------------+----------------+-----------------+-----------+
+   +------------------------+------------------------------------------------------------------------------------------------------+----------------+-----------------+------------+------------+----------------+-----------------+-----------+
+   |    Dataset             | Model config                                                                                         | Val (accuracy) | Test (accuracy) | Val (F1)   | Test (F1)  | Val (log_loss) | Test (log_loss) | Downloads |
+   +========================+======================================================================================================+================+=================+============+============+================+=================+===========+
+   | `paraphraser.ru`_      | :config:`paraphrase_ident_paraphraser_ft <ranking/paraphrase_ident_paraphraser_interact.json>`       |   83.8         |   75.4          |   87.9     |  80.9      |   0.468        |   0.616         | 5938M     |
+   +------------------------+------------------------------------------------------------------------------------------------------+----------------+-----------------+------------+------------+----------------+-----------------+-----------+
+   | `paraphraser.ru`_      | :config:`paraphrase_bert_multilingual <classifiers/paraphraser_bert.json>`                           |   87.4         |   79.3          |   90.2     |  83.4      |   --           |   --            | 1330M     |
+   +------------------------+------------------------------------------------------------------------------------------------------+----------------+-----------------+------------+------------+----------------+-----------------+-----------+
+   | `paraphraser.ru`_      | :config:`paraphrase_rubert <classifiers/paraphraser_rubert.json>`                                    |   90.2         |   84.9          |   92.3     |  87.9      |   --           |   --            | 1325M     |
+   +------------------------+------------------------------------------------------------------------------------------------------+----------------+-----------------+------------+------------+----------------+-----------------+-----------+
    | `paraphraser.ru`_      | :config:`paraphraser_convers_distilrubert_2L <classifiers/paraphraser_convers_distilrubert_2L.json>` |  76.1 ± 0.2    |  64.5 ± 0.5     | 81.8 ± 0.2 | 73.9 ± 0.8 |   --           |   --            | 618M      |
-   +------------------------+------------------------------------------------------------------------------------------------------------------+----------------+-----------------+------------+------------+----------------+-----------------+-----------+
+   +------------------------+------------------------------------------------------------------------------------------------------+----------------+-----------------+------------+------------+----------------+-----------------+-----------+
    | `paraphraser.ru`_      | :config:`paraphraser_convers_distilrubert_6L <classifiers/paraphraser_convers_distilrubert_6L.json>` |  86.5 ± 0.5    |  78.9 ± 0.4     | 89.6 ± 0.3 | 83.2 ± 0.5 |   --           |   --            | 930M      |
-   +------------------------+------------------------------------------------------------------------------------------------------------------+----------------+-----------------+------------+------------+----------------+-----------------+-----------+
+   +------------------------+------------------------------------------------------------------------------------------------------+----------------+-----------------+------------+------------+----------------+-----------------+-----------+
 
 .. _`paraphraser.ru`: https://paraphraser.ru/
 

From d8f53e5b26c580446a49e3c482a1f93cc031b9a7 Mon Sep 17 00:00:00 2001
From: Fedor Ignatov <ignatov.fedor@gmail.com>
Date: Fri, 30 Jul 2021 18:30:17 +0300
Subject: [PATCH 12/17] removed torch from names

---
 ... rusentiment_convers_distilrubert_2L.json} |   4 +-
 ... rusentiment_convers_distilrubert_6L.json} |   4 +-
 ...n => ner_rus_convers_distilrubert_2L.json} |   4 +-
 ...n => ner_rus_convers_distilrubert_6L.json} |   4 +-
 ... => squad_ru_convers_distilrubert_2L.json} |   4 +-
 ...uad_ru_convers_distilrubert_2L_infer.json} |   6 +-
 ... => squad_ru_convers_distilrubert_6L.json} |   4 +-
 ...uad_ru_convers_distilrubert_6L_infer.json} |   6 +-
 docs/features/overview.rst                    | 216 +++++++++---------
 tests/test_quick_start.py                     |  10 +-
 10 files changed, 135 insertions(+), 127 deletions(-)
 rename deeppavlov/configs/classifiers/{rusentiment_convers_distilrubert_2L_torch.json => rusentiment_convers_distilrubert_2L.json} (97%)
 rename deeppavlov/configs/classifiers/{rusentiment_convers_distilrubert_6L_torch.json => rusentiment_convers_distilrubert_6L.json} (97%)
 rename deeppavlov/configs/ner/{ner_rus_convers_distilrubert_2L_torch.json => ner_rus_convers_distilrubert_2L.json} (98%)
 rename deeppavlov/configs/ner/{ner_rus_convers_distilrubert_6L_torch.json => ner_rus_convers_distilrubert_6L.json} (98%)
 rename deeppavlov/configs/squad/{squad_ru_convers_distilrubert_2L_torch.json => squad_ru_convers_distilrubert_2L.json} (98%)
 rename deeppavlov/configs/squad/{squad_ru_convers_distilrubert_2L_torch_infer.json => squad_ru_convers_distilrubert_2L_infer.json} (95%)
 rename deeppavlov/configs/squad/{squad_ru_convers_distilrubert_6L_torch.json => squad_ru_convers_distilrubert_6L.json} (98%)
 rename deeppavlov/configs/squad/{squad_ru_convers_distilrubert_6L_torch_infer.json => squad_ru_convers_distilrubert_6L_infer.json} (95%)

diff --git a/deeppavlov/configs/classifiers/rusentiment_convers_distilrubert_2L_torch.json b/deeppavlov/configs/classifiers/rusentiment_convers_distilrubert_2L.json
similarity index 97%
rename from deeppavlov/configs/classifiers/rusentiment_convers_distilrubert_2L_torch.json
rename to deeppavlov/configs/classifiers/rusentiment_convers_distilrubert_2L.json
index 505fefb545..ff6c6cdabc 100644
--- a/deeppavlov/configs/classifiers/rusentiment_convers_distilrubert_2L_torch.json
+++ b/deeppavlov/configs/classifiers/rusentiment_convers_distilrubert_2L.json
@@ -133,7 +133,7 @@
       "DOWNLOADS_PATH": "{ROOT_PATH}/downloads",
       "TRANSFORMER": "DeepPavlov/distilrubert-tiny-cased-conversational",
       "MODELS_PATH": "{ROOT_PATH}/models",
-      "MODEL_PATH": "{MODELS_PATH}/classifiers/rusentiment_convers_distilrubert_2L_torch"
+      "MODEL_PATH": "{MODELS_PATH}/classifiers/rusentiment_convers_distilrubert_2L"
     },
     "download": [
       {
@@ -145,7 +145,7 @@
         "subdir": "{DOWNLOADS_PATH}/rusentiment"
       },
       {
-        "url": "http://files.deeppavlov.ai/deeppavlov_data/classifiers/rusentiment_convers_distilrubert_2L_torch.tar.gz",
+        "url": "http://files.deeppavlov.ai/deeppavlov_data/classifiers/rusentiment_convers_distilrubert_2L.tar.gz",
         "subdir": "{MODELS_PATH}/classifiers/"
       }
     ]
diff --git a/deeppavlov/configs/classifiers/rusentiment_convers_distilrubert_6L_torch.json b/deeppavlov/configs/classifiers/rusentiment_convers_distilrubert_6L.json
similarity index 97%
rename from deeppavlov/configs/classifiers/rusentiment_convers_distilrubert_6L_torch.json
rename to deeppavlov/configs/classifiers/rusentiment_convers_distilrubert_6L.json
index 9d06ab2701..b211ebed0c 100644
--- a/deeppavlov/configs/classifiers/rusentiment_convers_distilrubert_6L_torch.json
+++ b/deeppavlov/configs/classifiers/rusentiment_convers_distilrubert_6L.json
@@ -133,7 +133,7 @@
       "DOWNLOADS_PATH": "{ROOT_PATH}/downloads",
       "TRANSFORMER": "DeepPavlov/distilrubert-base-cased-conversational",
       "MODELS_PATH": "{ROOT_PATH}/models",
-      "MODEL_PATH": "{MODELS_PATH}/classifiers/rusentiment_convers_distilrubert_6L_torch"
+      "MODEL_PATH": "{MODELS_PATH}/classifiers/rusentiment_convers_distilrubert_6L"
     },
     "download": [
       {
@@ -145,7 +145,7 @@
         "subdir": "{DOWNLOADS_PATH}/rusentiment"
       }, 
       {
-        "url": "http://files.deeppavlov.ai/deeppavlov_data/classifiers/rusentiment_convers_distilrubert_6L_torch.tar.gz",
+        "url": "http://files.deeppavlov.ai/deeppavlov_data/classifiers/rusentiment_convers_distilrubert_6L.tar.gz",
         "subdir": "{MODELS_PATH}/classifiers/"
       }
     ]
diff --git a/deeppavlov/configs/ner/ner_rus_convers_distilrubert_2L_torch.json b/deeppavlov/configs/ner/ner_rus_convers_distilrubert_2L.json
similarity index 98%
rename from deeppavlov/configs/ner/ner_rus_convers_distilrubert_2L_torch.json
rename to deeppavlov/configs/ner/ner_rus_convers_distilrubert_2L.json
index 147aef5cee..15c931c1eb 100644
--- a/deeppavlov/configs/ner/ner_rus_convers_distilrubert_2L_torch.json
+++ b/deeppavlov/configs/ner/ner_rus_convers_distilrubert_2L.json
@@ -142,12 +142,12 @@
       "ROOT_PATH": "~/.deeppavlov",
       "DOWNLOADS_PATH": "{ROOT_PATH}/downloads",
       "MODELS_PATH": "{ROOT_PATH}/models", 
-      "MODEL_PATH": "{MODELS_PATH}/ner_rus_conversational_distilrubert_2L_torch", 
+      "MODEL_PATH": "{MODELS_PATH}/ner_rus_conversational_distilrubert_2L",
       "TRANSFORMER": "DeepPavlov/distilrubert-tiny-cased-conversational"
     }, 
     "download": [
       {
-        "url": "http://files.deeppavlov.ai/v1/ner/ner_rus_conversational_distilrubert_2L_torch.tar.gz",
+        "url": "http://files.deeppavlov.ai/v1/ner/ner_rus_conversational_distilrubert_2L.tar.gz",
         "subdir": "{MODELS_PATH}"
       }
     ]
diff --git a/deeppavlov/configs/ner/ner_rus_convers_distilrubert_6L_torch.json b/deeppavlov/configs/ner/ner_rus_convers_distilrubert_6L.json
similarity index 98%
rename from deeppavlov/configs/ner/ner_rus_convers_distilrubert_6L_torch.json
rename to deeppavlov/configs/ner/ner_rus_convers_distilrubert_6L.json
index 0878c393e2..b2534426a6 100644
--- a/deeppavlov/configs/ner/ner_rus_convers_distilrubert_6L_torch.json
+++ b/deeppavlov/configs/ner/ner_rus_convers_distilrubert_6L.json
@@ -142,12 +142,12 @@
       "ROOT_PATH": "~/.deeppavlov",
       "DOWNLOADS_PATH": "{ROOT_PATH}/downloads",
       "MODELS_PATH": "{ROOT_PATH}/models", 
-      "MODEL_PATH": "{MODELS_PATH}/ner_rus_conversational_distilrubert_6L_torch", 
+      "MODEL_PATH": "{MODELS_PATH}/ner_rus_conversational_distilrubert_6L",
       "TRANSFORMER": "DeepPavlov/distilrubert-base-cased-conversational"
     }, 
     "download": [
       {
-        "url": "http://files.deeppavlov.ai/v1/ner/ner_rus_conversational_distilrubert_6L_torch.tar.gz",
+        "url": "http://files.deeppavlov.ai/v1/ner/ner_rus_conversational_distilrubert_6L.tar.gz",
         "subdir": "{MODELS_PATH}"
       }
     ]
diff --git a/deeppavlov/configs/squad/squad_ru_convers_distilrubert_2L_torch.json b/deeppavlov/configs/squad/squad_ru_convers_distilrubert_2L.json
similarity index 98%
rename from deeppavlov/configs/squad/squad_ru_convers_distilrubert_2L_torch.json
rename to deeppavlov/configs/squad/squad_ru_convers_distilrubert_2L.json
index 162a8f7013..f278ad9627 100644
--- a/deeppavlov/configs/squad/squad_ru_convers_distilrubert_2L_torch.json
+++ b/deeppavlov/configs/squad/squad_ru_convers_distilrubert_2L.json
@@ -161,11 +161,11 @@
       "DOWNLOADS_PATH": "{ROOT_PATH}/downloads",
       "TRANSFORMER": "DeepPavlov/distilrubert-tiny-cased-conversational",
       "MODELS_PATH": "{ROOT_PATH}/models",
-      "MODEL_PATH": "{MODELS_PATH}/squad_ru_convers_distilrubert_2L_torch"
+      "MODEL_PATH": "{MODELS_PATH}/squad_ru_convers_distilrubert_2L"
     },
     "download": [
       {
-        "url": "http://files.deeppavlov.ai/deeppavlov_data/squad_ru_convers_distilrubert_2L_torch.tar.gz",
+        "url": "http://files.deeppavlov.ai/deeppavlov_data/squad_ru_convers_distilrubert_2L.tar.gz",
         "subdir": "{MODELS_PATH}"
       }
     ]
diff --git a/deeppavlov/configs/squad/squad_ru_convers_distilrubert_2L_torch_infer.json b/deeppavlov/configs/squad/squad_ru_convers_distilrubert_2L_infer.json
similarity index 95%
rename from deeppavlov/configs/squad/squad_ru_convers_distilrubert_2L_torch_infer.json
rename to deeppavlov/configs/squad/squad_ru_convers_distilrubert_2L_infer.json
index b67331a2fc..9202d83ba8 100644
--- a/deeppavlov/configs/squad/squad_ru_convers_distilrubert_2L_torch_infer.json
+++ b/deeppavlov/configs/squad/squad_ru_convers_distilrubert_2L_infer.json
@@ -18,7 +18,7 @@
         "class_name": "torch_transformers_squad_infer",
         "lang": "ru", 
         "batch_size": 128,
-        "squad_model_config": "{CONFIGS_PATH}/squad/squad_ru_convers_distilrubert_2L_torch.json",
+        "squad_model_config": "{CONFIGS_PATH}/squad/squad_ru_convers_distilrubert_2L.json",
         "vocab_file": "{TRANSFORMER}",
         "do_lower_case": "{lowercase}",
         "max_seq_length": 256,
@@ -63,12 +63,12 @@
       "DOWNLOADS_PATH": "{ROOT_PATH}/downloads",
       "TRANSFORMER": "DeepPavlov/distilrubert-tiny-cased-conversational",
       "MODELS_PATH": "{ROOT_PATH}/models",
-      "MODEL_PATH": "{MODELS_PATH}/squad_ru_convers_distilrubert_2L_torch",
+      "MODEL_PATH": "{MODELS_PATH}/squad_ru_convers_distilrubert_2L",
       "CONFIGS_PATH": "{DEEPPAVLOV_PATH}/configs"
     },
     "download": [
       {
-        "url": "http://files.deeppavlov.ai/deeppavlov_data/squad_ru_convers_distilrubert_2L_torch.tar.gz",
+        "url": "http://files.deeppavlov.ai/deeppavlov_data/squad_ru_convers_distilrubert_2L.tar.gz",
         "subdir": "{MODELS_PATH}"
       }
     ]
diff --git a/deeppavlov/configs/squad/squad_ru_convers_distilrubert_6L_torch.json b/deeppavlov/configs/squad/squad_ru_convers_distilrubert_6L.json
similarity index 98%
rename from deeppavlov/configs/squad/squad_ru_convers_distilrubert_6L_torch.json
rename to deeppavlov/configs/squad/squad_ru_convers_distilrubert_6L.json
index 574702120c..8ca10a28f7 100644
--- a/deeppavlov/configs/squad/squad_ru_convers_distilrubert_6L_torch.json
+++ b/deeppavlov/configs/squad/squad_ru_convers_distilrubert_6L.json
@@ -161,11 +161,11 @@
       "DOWNLOADS_PATH": "{ROOT_PATH}/downloads",
       "TRANSFORMER": "DeepPavlov/distilrubert-base-cased-conversational",
       "MODELS_PATH": "{ROOT_PATH}/models",
-      "MODEL_PATH": "{MODELS_PATH}/squad_ru_convers_distilrubert_6L_torch"
+      "MODEL_PATH": "{MODELS_PATH}/squad_ru_convers_distilrubert_6L"
     },
     "download": [
       {
-        "url": "http://files.deeppavlov.ai/deeppavlov_data/squad_ru_convers_distilrubert_6L_torch.tar.gz",
+        "url": "http://files.deeppavlov.ai/deeppavlov_data/squad_ru_convers_distilrubert_6L.tar.gz",
         "subdir": "{MODELS_PATH}"
       }
     ]
diff --git a/deeppavlov/configs/squad/squad_ru_convers_distilrubert_6L_torch_infer.json b/deeppavlov/configs/squad/squad_ru_convers_distilrubert_6L_infer.json
similarity index 95%
rename from deeppavlov/configs/squad/squad_ru_convers_distilrubert_6L_torch_infer.json
rename to deeppavlov/configs/squad/squad_ru_convers_distilrubert_6L_infer.json
index dbe16c055d..5c6171311c 100644
--- a/deeppavlov/configs/squad/squad_ru_convers_distilrubert_6L_torch_infer.json
+++ b/deeppavlov/configs/squad/squad_ru_convers_distilrubert_6L_infer.json
@@ -18,7 +18,7 @@
         "class_name": "torch_transformers_squad_infer",
         "lang": "ru", 
         "batch_size": 128,
-        "squad_model_config": "{CONFIGS_PATH}/squad/squad_ru_convers_distilrubert_6L_torch.json",
+        "squad_model_config": "{CONFIGS_PATH}/squad/squad_ru_convers_distilrubert_6L.json",
         "vocab_file": "{TRANSFORMER}",
         "do_lower_case": "{lowercase}",
         "max_seq_length": 256,
@@ -63,12 +63,12 @@
       "DOWNLOADS_PATH": "{ROOT_PATH}/downloads",
       "TRANSFORMER": "DeepPavlov/distilrubert-base-cased-conversational",
       "MODELS_PATH": "{ROOT_PATH}/models",
-      "MODEL_PATH": "{MODELS_PATH}/squad_ru_convers_distilrubert_6L_torch",
+      "MODEL_PATH": "{MODELS_PATH}/squad_ru_convers_distilrubert_6L",
       "CONFIGS_PATH": "{DEEPPAVLOV_PATH}/configs"
     },
     "download": [
       {
-        "url": "http://files.deeppavlov.ai/deeppavlov_data/squad_ru_convers_distilrubert_6L_torch.tar.gz",
+        "url": "http://files.deeppavlov.ai/deeppavlov_data/squad_ru_convers_distilrubert_6L.tar.gz",
         "subdir": "{MODELS_PATH}"
       }
     ]
diff --git a/docs/features/overview.rst b/docs/features/overview.rst
index ab2015ed37..7a515a40cb 100644
--- a/docs/features/overview.rst
+++ b/docs/features/overview.rst
@@ -20,31 +20,31 @@ The second model reproduces architecture from the paper `Application
 of a Hybrid Bi-LSTM-CRF model to the task of Russian Named Entity Recognition <https://arxiv.org/pdf/1709.09686.pdf>`__
 which is inspired by Bi-LSTM+CRF architecture from https://arxiv.org/pdf/1603.01360.pdf.
 
-+---------------------------------------------------------+-------+--------------------------------------------------------------------------------------------------------+-------------+
-| Dataset                                                 | Lang  | Model                                                                                                  |   Test F1   |
-+=========================================================+=======+========================================================================================================+=============+
-| Persons-1000 dataset with additional LOC and ORG markup | Ru    | :config:`ner_rus_bert.json <ner/ner_rus_bert.json>`                                                    |    98.1     |
-+                                                         +       +--------------------------------------------------------------------------------------------------------+-------------+
-| (Collection 3)                                          |       | :config:`ner_rus.json <ner/ner_rus.json>`                                                              |    95.1     |
-+                                                         +       +--------------------------------------------------------------------------------------------------------+-------------+
-|                                                         |       | :config:`ner_rus_convers_distilrubert_2L_torch.json  <ner/ner_rus_convers_distilrubert_2L_torch.json>` |  88.4 ± 0.5 |                                                                                                                                                    
-+                                                         +       +--------------------------------------------------------------------------------------------------------+-------------+
-|                                                         |       | :config:`ner_rus_convers_distilrubert_6L_torch.json  <ner/ner_rus_convers_distilrubert_6L_torch.json>` |  93.3 ± 0.3 |                                                         
-+---------------------------------------------------------+-------+--------------------------------------------------------------------------------------------------------+-------------+
-| Ontonotes                                               | Multi | :config:`ner_ontonotes_bert_mult.json <ner/ner_ontonotes_bert_mult.json>`                              |    88.8     |
-+                                                         +-------+--------------------------------------------------------------------------------------------------------+-------------+
-|                                                         | En    | :config:`ner_ontonotes_bert.json <ner/ner_ontonotes_bert.json>`                                        |    88.6     |
-+                                                         +       +--------------------------------------------------------------------------------------------------------+-------------+
-|                                                         |       | :config:`ner_ontonotes.json <ner/ner_ontonotes.json>`                                                  |    87.1     |
-+---------------------------------------------------------+       +--------------------------------------------------------------------------------------------------------+-------------+
-| ConLL-2003                                              |       | :config:`ner_conll2003_bert.json <ner/ner_conll2003_bert.json>`                                        |    91.7     |
-+                                                         +       +--------------------------------------------------------------------------------------------------------+-------------+
-|                                                         |       | :config:`ner_conll2003_torch_bert.json <ner/ner_conll2003_torch_bert.json>`                            |    88.6     |
-+                                                         +       +--------------------------------------------------------------------------------------------------------+-------------+
-|                                                         |       | :config:`ner_conll2003.json <ner/ner_conll2003.json>`                                                  |    89.9     |
-+---------------------------------------------------------+       +--------------------------------------------------------------------------------------------------------+-------------+
-| DSTC2                                                   |       | :config:`ner_dstc2.json <ner/ner_dstc2.json>`                                                          |    97.1     |
-+---------------------------------------------------------+-------+--------------------------------------------------------------------------------------------------------+-------------+
++---------------------------------------------------------+-------+--------------------------------------------------------------------------------------------+-------------+
+| Dataset                                                 | Lang  | Model                                                                                      |   Test F1   |
++=========================================================+=======+============================================================================================+=============+
+| Persons-1000 dataset with additional LOC and ORG markup | Ru    | :config:`ner_rus_bert.json <ner/ner_rus_bert.json>`                                        |    98.1     |
++                                                         +       +--------------------------------------------------------------------------------------------+-------------+
+| (Collection 3)                                          |       | :config:`ner_rus.json <ner/ner_rus.json>`                                                  |    95.1     |
++                                                         +       +--------------------------------------------------------------------------------------------+-------------+
+|                                                         |       | :config:`ner_rus_convers_distilrubert_2L.json  <ner/ner_rus_convers_distilrubert_2L.json>` |  88.4 ± 0.5 |
++                                                         +       +--------------------------------------------------------------------------------------------+-------------+
+|                                                         |       | :config:`ner_rus_convers_distilrubert_6L.json  <ner/ner_rus_convers_distilrubert_6L.json>` |  93.3 ± 0.3 |
++---------------------------------------------------------+-------+--------------------------------------------------------------------------------------------+-------------+
+| Ontonotes                                               | Multi | :config:`ner_ontonotes_bert_mult.json <ner/ner_ontonotes_bert_mult.json>`                  |    88.8     |
++                                                         +-------+--------------------------------------------------------------------------------------------+-------------+
+|                                                         | En    | :config:`ner_ontonotes_bert.json <ner/ner_ontonotes_bert.json>`                            |    88.6     |
++                                                         +       +--------------------------------------------------------------------------------------------+-------------+
+|                                                         |       | :config:`ner_ontonotes.json <ner/ner_ontonotes.json>`                                      |    87.1     |
++---------------------------------------------------------+       +--------------------------------------------------------------------------------------------+-------------+
+| ConLL-2003                                              |       | :config:`ner_conll2003_bert.json <ner/ner_conll2003_bert.json>`                            |    91.7     |
++                                                         +       +--------------------------------------------------------------------------------------------+-------------+
+|                                                         |       | :config:`ner_conll2003_torch_bert.json <ner/ner_conll2003_torch_bert.json>`                |    88.6     |
++                                                         +       +--------------------------------------------------------------------------------------------+-------------+
+|                                                         |       | :config:`ner_conll2003.json <ner/ner_conll2003.json>`                                      |    89.9     |
++---------------------------------------------------------+       +--------------------------------------------------------------------------------------------+-------------+
+| DSTC2                                                   |       | :config:`ner_dstc2.json <ner/ner_dstc2.json>`                                              |    97.1     |
++---------------------------------------------------------+-------+--------------------------------------------------------------------------------------------+-------------+
 
 Slot filling models :doc:`[docs] </features/models/slot_filling>`
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
@@ -67,65 +67,65 @@ BiLSTM with self-attention and other models are presented. The model also allows
 Several pre-trained models are available and presented in Table below.
 
 
-+------------------+---------------------+------+----------------------------------------------------------------------------------------------------------+-------------+------------------+-----------------+-----------+
-| Task             | Dataset             | Lang | Model                                                                                                    | Metric      | Valid            | Test            | Downloads |
-+==================+=====================+======+==========================================================================================================+=============+==================+=================+===========+
-| 28 intents       | `DSTC 2`_           | En   | :config:`DSTC 2 emb <classifiers/intents_dstc2.json>`                                                    | Accuracy    | 0.7613           | 0.7733          |  800 Mb   |
-+                  +                     +      +----------------------------------------------------------------------------------------------------------+             +------------------+-----------------+-----------+
-|                  |                     |      | :config:`Wiki emb <classifiers/intents_dstc2_big.json>`                                                  |             | 0.9629           | 0.9617          |  8.5 Gb   |
-+                  +                     +      +----------------------------------------------------------------------------------------------------------+             +------------------+-----------------+-----------+
-|                  |                     |      | :config:`BERT <classifiers/intents_dstc2_bert.json>`                                                     |             | 0.9673           | 0.9636          |  800 Mb   |
-+------------------+---------------------+      +----------------------------------------------------------------------------------------------------------+-------------+------------------+-----------------+-----------+
-| 7 intents        | `SNIPS-2017`_ [1]_  |      | :config:`DSTC 2 emb <classifiers/intents_snips.json>`                                                    | F1-macro    | 0.8591           |    --           |  800 Mb   |
-+                  +                     +      +----------------------------------------------------------------------------------------------------------+             +------------------+-----------------+-----------+
-|                  |                     |      | :config:`Wiki emb <classifiers/intents_snips_big.json>`                                                  |             | 0.9820           |    --           |  8.5 Gb   |
-+                  +                     +      +----------------------------------------------------------------------------------------------------------+             +------------------+-----------------+-----------+
-|                  |                     |      | :config:`Tfidf + SelectKBest + PCA + Wiki emb <classifiers/intents_snips_sklearn.json>`                  |             | 0.9673           |    --           |  8.6 Gb   |
-+                  +                     +      +----------------------------------------------------------------------------------------------------------+             +------------------+-----------------+-----------+
-|                  |                     |      | :config:`Wiki emb weighted by Tfidf <classifiers/intents_snips_tfidf_weighted.json>`                     |             | 0.9786           |    --           |  8.5 Gb   |
-+------------------+---------------------+      +----------------------------------------------------------------------------------------------------------+-------------+------------------+-----------------+-----------+
-| Insult detection | `Insults`_          |      | :config:`Reddit emb <classifiers/insults_kaggle.json>`                                                   | ROC-AUC     | 0.9263           | 0.8556          |  6.2 Gb   |
-+                  +                     +      +----------------------------------------------------------------------------------------------------------+             +------------------+-----------------+-----------+
-|                  |                     |      | :config:`English BERT <classifiers/insults_kaggle_bert.json>`                                            |             | 0.9255           | 0.8612          |  1200 Mb  |
-+                  +                     +      +----------------------------------------------------------------------------------------------------------+             +------------------+-----------------+-----------+
-|                  |                     |      | :config:`English Conversational BERT <classifiers/insults_kaggle_conv_bert.json>`                        |             | 0.9389           | 0.8941          |  1200 Mb  |
-+                  +                     +      +----------------------------------------------------------------------------------------------------------+             +------------------+-----------------+-----------+
-|                  |                     |      | :config:`English BERT on PyTorch <classifiers/insults_kaggle_bert_torch.json>`                           |             | 0.9329           | 0.877           |  1.1 Gb   |
-+------------------+---------------------+      +----------------------------------------------------------------------------------------------------------+-------------+------------------+-----------------+-----------+
-| 5 topics         | `AG News`_          |      | :config:`Wiki emb <classifiers/topic_ag_news.json>`                                                      | Accuracy    | 0.8922           | 0.9059          |  8.5 Gb   |
-+------------------+---------------------+      +----------------------------------------------------------------------------------------------------------+-------------+------------------+-----------------+-----------+
-| Intent           | `Yahoo-L31`_        |      | :config:`Yahoo-L31 on conversational BERT <classifiers/yahoo_convers_vs_info_bert.json>`                 | ROC-AUC     | 0.9436           |   --            |  1200 Mb  |
-+------------------+---------------------+      +----------------------------------------------------------------------------------------------------------+-------------+------------------+-----------------+-----------+
-| Sentiment        | `SST`_              |      | :config:`5-classes SST on conversational BERT <classifiers/sentiment_sst_conv_bert.json>`                | Accuracy    | 0.6456           | 0.6715          |  400 Mb   |
-+                  +                     +      +----------------------------------------------------------------------------------------------------------+             +------------------+-----------------+-----------+
-|                  |                     |      | :config:`5-classes SST on multilingual BERT <classifiers/sentiment_sst_multi_bert.json>`                 |             | 0.5738           | 0.6024          |  660 Mb   |
-+                  +                     +      +----------------------------------------------------------------------------------------------------------+             +------------------+-----------------+-----------+
-|                  |                     |      | :config:`3-classes SST SWCNN on PyTorch <classifiers/sst_torch_swcnn.json>`                              |             | 0.7379           | 0.6312          |  4.3 Mb   |
-+                  +---------------------+      +----------------------------------------------------------------------------------------------------------+             +------------------+-----------------+-----------+
-|                  | `Yelp`_             |      | :config:`5-classes Yelp on conversational BERT <classifiers/sentiment_yelp_conv_bert.json>`              |             | 0.6925           | 0.6842          |  400 Mb   |
-+                  +                     +      +----------------------------------------------------------------------------------------------------------+             +------------------+-----------------+-----------+
-|                  |                     |      | :config:`5-classes Yelp on multilingual BERT <classifiers/sentiment_yelp_multi_bert.json>`               |             | 0.5896           | 0.5874          |  660 Mb   |
-+------------------+---------------------+------+----------------------------------------------------------------------------------------------------------+-------------+------------------+-----------------+-----------+
-| Sentiment        | `Twitter mokoron`_  | Ru   | :config:`RuWiki+Lenta emb w/o preprocessing <classifiers/sentiment_twitter.json>`                        |             | 0.9965           | 0.9961          |  6.2 Gb   |
-+                  +                     +      +----------------------------------------------------------------------------------------------------------+             +------------------+-----------------+-----------+
-|                  |                     |      | :config:`RuWiki+Lenta emb with preprocessing <classifiers/sentiment_twitter_preproc.json>`               |             | 0.7823           | 0.7759          |  6.2 Gb   |
-+                  +---------------------+      +----------------------------------------------------------------------------------------------------------+-------------+------------------+-----------------+-----------+
-|                  | `RuSentiment`_      |      | :config:`RuWiki+Lenta emb <classifiers/rusentiment_cnn.json>`                                            | F1-weighted | 0.6541           | 0.7016          |  6.2 Gb   |
-+                  +                     +      +----------------------------------------------------------------------------------------------------------+             +------------------+-----------------+-----------+
-|                  |                     |      | :config:`Twitter emb super-convergence <classifiers/rusentiment_bigru_superconv.json>` [2]_              |             | 0.7301           | 0.7576          |  3.4 Gb   |
-+                  +                     +      +----------------------------------------------------------------------------------------------------------+             +------------------+-----------------+-----------+
-|                  |                     |      | :config:`ELMo <classifiers/rusentiment_elmo_twitter_cnn.json>`                                           |             | 0.7519           | 0.7875          |  700 Mb   |
-+                  +                     +      +----------------------------------------------------------------------------------------------------------+             +------------------+-----------------+-----------+
-|                  |                     |      | :config:`Multi-language BERT <classifiers/rusentiment_bert.json>`                                        |             | 0.6809           | 0.7193          |  1900 Mb  |
-+                  +                     +      +----------------------------------------------------------------------------------------------------------+             +------------------+-----------------+-----------+
-|                  |                     |      | :config:`Conversational RuBERT <classifiers/rusentiment_convers_bert.json>`                              |             | 0.7548           | 0.7742          |  657 Mb   |
-+                  +                     +      +----------------------------------------------------------------------------------------------------------+             +------------------+-----------------+-----------+
-|                  |                     |      | :config:`Conversational DistilRuBERT-tiny <classifiers/rusentiment_convers_distilrubert_2L_torch.json>`  |             |  0.703 ± 0.0031  | 0.7348 ± 0.0028 |  690 Mb   |
-+                  +                     +      +----------------------------------------------------------------------------------------------------------+             +------------------+-----------------+-----------+
-|                  |                     |      | :config:`Conversational DistilRuBERT-base <classifiers/rusentiment_convers_distilrubert_6L_torch.json>`  |             |  0.7376 ± 0.0045 | 0.7645 ± 0.035  |  1.0 Gb   |
-+------------------+---------------------+      +----------------------------------------------------------------------------------------------------------+-------------+------------------+-----------------+-----------+
-| Intent           | Ru like`Yahoo-L31`_ |      | :config:`Conversational vs Informational on ELMo <classifiers/yahoo_convers_vs_info.json>`               | ROC-AUC     | 0.9412           |   --            |  700 Mb   |
-+------------------+---------------------+------+----------------------------------------------------------------------------------------------------------+-------------+------------------+-----------------+-----------+
++------------------+---------------------+------+----------------------------------------------------------------------------------------------------+-------------+------------------+-----------------+-----------+
+| Task             | Dataset             | Lang | Model                                                                                              | Metric      | Valid            | Test            | Downloads |
++==================+=====================+======+====================================================================================================+=============+==================+=================+===========+
+| 28 intents       | `DSTC 2`_           | En   | :config:`DSTC 2 emb <classifiers/intents_dstc2.json>`                                              | Accuracy    | 0.7613           | 0.7733          |  800 Mb   |
++                  +                     +      +----------------------------------------------------------------------------------------------------+             +------------------+-----------------+-----------+
+|                  |                     |      | :config:`Wiki emb <classifiers/intents_dstc2_big.json>`                                            |             | 0.9629           | 0.9617          |  8.5 Gb   |
++                  +                     +      +----------------------------------------------------------------------------------------------------+             +------------------+-----------------+-----------+
+|                  |                     |      | :config:`BERT <classifiers/intents_dstc2_bert.json>`                                               |             | 0.9673           | 0.9636          |  800 Mb   |
++------------------+---------------------+      +----------------------------------------------------------------------------------------------------+-------------+------------------+-----------------+-----------+
+| 7 intents        | `SNIPS-2017`_ [1]_  |      | :config:`DSTC 2 emb <classifiers/intents_snips.json>`                                              | F1-macro    | 0.8591           |    --           |  800 Mb   |
++                  +                     +      +----------------------------------------------------------------------------------------------------+             +------------------+-----------------+-----------+
+|                  |                     |      | :config:`Wiki emb <classifiers/intents_snips_big.json>`                                            |             | 0.9820           |    --           |  8.5 Gb   |
++                  +                     +      +----------------------------------------------------------------------------------------------------+             +------------------+-----------------+-----------+
+|                  |                     |      | :config:`Tfidf + SelectKBest + PCA + Wiki emb <classifiers/intents_snips_sklearn.json>`            |             | 0.9673           |    --           |  8.6 Gb   |
++                  +                     +      +----------------------------------------------------------------------------------------------------+             +------------------+-----------------+-----------+
+|                  |                     |      | :config:`Wiki emb weighted by Tfidf <classifiers/intents_snips_tfidf_weighted.json>`               |             | 0.9786           |    --           |  8.5 Gb   |
++------------------+---------------------+      +----------------------------------------------------------------------------------------------------+-------------+------------------+-----------------+-----------+
+| Insult detection | `Insults`_          |      | :config:`Reddit emb <classifiers/insults_kaggle.json>`                                             | ROC-AUC     | 0.9263           | 0.8556          |  6.2 Gb   |
++                  +                     +      +----------------------------------------------------------------------------------------------------+             +------------------+-----------------+-----------+
+|                  |                     |      | :config:`English BERT <classifiers/insults_kaggle_bert.json>`                                      |             | 0.9255           | 0.8612          |  1200 Mb  |
++                  +                     +      +----------------------------------------------------------------------------------------------------+             +------------------+-----------------+-----------+
+|                  |                     |      | :config:`English Conversational BERT <classifiers/insults_kaggle_conv_bert.json>`                  |             | 0.9389           | 0.8941          |  1200 Mb  |
++                  +                     +      +----------------------------------------------------------------------------------------------------+             +------------------+-----------------+-----------+
+|                  |                     |      | :config:`English BERT on PyTorch <classifiers/insults_kaggle_bert_torch.json>`                     |             | 0.9329           | 0.877           |  1.1 Gb   |
++------------------+---------------------+      +----------------------------------------------------------------------------------------------------+-------------+------------------+-----------------+-----------+
+| 5 topics         | `AG News`_          |      | :config:`Wiki emb <classifiers/topic_ag_news.json>`                                                | Accuracy    | 0.8922           | 0.9059          |  8.5 Gb   |
++------------------+---------------------+      +----------------------------------------------------------------------------------------------------+-------------+------------------+-----------------+-----------+
+| Intent           | `Yahoo-L31`_        |      | :config:`Yahoo-L31 on conversational BERT <classifiers/yahoo_convers_vs_info_bert.json>`           | ROC-AUC     | 0.9436           |   --            |  1200 Mb  |
++------------------+---------------------+      +----------------------------------------------------------------------------------------------------+-------------+------------------+-----------------+-----------+
+| Sentiment        | `SST`_              |      | :config:`5-classes SST on conversational BERT <classifiers/sentiment_sst_conv_bert.json>`          | Accuracy    | 0.6456           | 0.6715          |  400 Mb   |
++                  +                     +      +----------------------------------------------------------------------------------------------------+             +------------------+-----------------+-----------+
+|                  |                     |      | :config:`5-classes SST on multilingual BERT <classifiers/sentiment_sst_multi_bert.json>`           |             | 0.5738           | 0.6024          |  660 Mb   |
++                  +                     +      +----------------------------------------------------------------------------------------------------+             +------------------+-----------------+-----------+
+|                  |                     |      | :config:`3-classes SST SWCNN on PyTorch <classifiers/sst_torch_swcnn.json>`                        |             | 0.7379           | 0.6312          |  4.3 Mb   |
++                  +---------------------+      +----------------------------------------------------------------------------------------------------+             +------------------+-----------------+-----------+
+|                  | `Yelp`_             |      | :config:`5-classes Yelp on conversational BERT <classifiers/sentiment_yelp_conv_bert.json>`        |             | 0.6925           | 0.6842          |  400 Mb   |
++                  +                     +      +----------------------------------------------------------------------------------------------------+             +------------------+-----------------+-----------+
+|                  |                     |      | :config:`5-classes Yelp on multilingual BERT <classifiers/sentiment_yelp_multi_bert.json>`         |             | 0.5896           | 0.5874          |  660 Mb   |
++------------------+---------------------+------+----------------------------------------------------------------------------------------------------+-------------+------------------+-----------------+-----------+
+| Sentiment        | `Twitter mokoron`_  | Ru   | :config:`RuWiki+Lenta emb w/o preprocessing <classifiers/sentiment_twitter.json>`                  |             | 0.9965           | 0.9961          |  6.2 Gb   |
++                  +                     +      +----------------------------------------------------------------------------------------------------+             +------------------+-----------------+-----------+
+|                  |                     |      | :config:`RuWiki+Lenta emb with preprocessing <classifiers/sentiment_twitter_preproc.json>`         |             | 0.7823           | 0.7759          |  6.2 Gb   |
++                  +---------------------+      +----------------------------------------------------------------------------------------------------+-------------+------------------+-----------------+-----------+
+|                  | `RuSentiment`_      |      | :config:`RuWiki+Lenta emb <classifiers/rusentiment_cnn.json>`                                      | F1-weighted | 0.6541           | 0.7016          |  6.2 Gb   |
++                  +                     +      +----------------------------------------------------------------------------------------------------+             +------------------+-----------------+-----------+
+|                  |                     |      | :config:`Twitter emb super-convergence <classifiers/rusentiment_bigru_superconv.json>` [2]_        |             | 0.7301           | 0.7576          |  3.4 Gb   |
++                  +                     +      +----------------------------------------------------------------------------------------------------+             +------------------+-----------------+-----------+
+|                  |                     |      | :config:`ELMo <classifiers/rusentiment_elmo_twitter_cnn.json>`                                     |             | 0.7519           | 0.7875          |  700 Mb   |
++                  +                     +      +----------------------------------------------------------------------------------------------------+             +------------------+-----------------+-----------+
+|                  |                     |      | :config:`Multi-language BERT <classifiers/rusentiment_bert.json>`                                  |             | 0.6809           | 0.7193          |  1900 Mb  |
++                  +                     +      +----------------------------------------------------------------------------------------------------+             +------------------+-----------------+-----------+
+|                  |                     |      | :config:`Conversational RuBERT <classifiers/rusentiment_convers_bert.json>`                        |             | 0.7548           | 0.7742          |  657 Mb   |
++                  +                     +      +----------------------------------------------------------------------------------------------------+             +------------------+-----------------+-----------+
+|                  |                     |      | :config:`Conversational DistilRuBERT-tiny <classifiers/rusentiment_convers_distilrubert_2L.json>`  |             |  0.703 ± 0.0031  | 0.7348 ± 0.0028 |  690 Mb   |
++                  +                     +      +----------------------------------------------------------------------------------------------------+             +------------------+-----------------+-----------+
+|                  |                     |      | :config:`Conversational DistilRuBERT-base <classifiers/rusentiment_convers_distilrubert_6L.json>`  |             |  0.7376 ± 0.0045 | 0.7645 ± 0.035  |  1.0 Gb   |
++------------------+---------------------+      +----------------------------------------------------------------------------------------------------+-------------+------------------+-----------------+-----------+
+| Intent           | Ru like `Yahoo-L31`_|      | :config:`Conversational vs Informational on ELMo <classifiers/yahoo_convers_vs_info.json>`         | ROC-AUC     | 0.9412           |   --            |  700 Mb   |
++------------------+---------------------+------+----------------------------------------------------------------------------------------------------+-------------+------------------+-----------------+-----------+
 
 .. [1] Coucke A. et al. Snips voice platform: an embedded spoken language understanding system for private-by-design voice interfaces //arXiv preprint arXiv:1805.10190. – 2018.
 .. [2] Smith L. N., Topin N. Super-convergence: Very fast training of residual networks using large learning rates. – 2018.
@@ -331,29 +331,29 @@ BERT-based model is described in  `BERT: Pre-training of Deep Bidirectional Tran
 R-Net model is based on `R-NET: Machine Reading Comprehension with Self-matching Networks
 <https://www.microsoft.com/en-us/research/publication/mcr/>`__.
 
-+---------------+---------------------------------------------------------------------------------------------------+-------+----------------+-----------------+-----------------+
-|    Dataset    | Model config                                                                                      | lang  |    EM (dev)    |    F-1 (dev)    |    Downloads    |
-+===============+===================================================================================================+=======+================+=================+=================+
-| `SQuAD-v1.1`_ | :config:`DeepPavlov BERT <squad/squad_bert.json>`                                                 |  en   |     80.88      |     88.49       |     806Mb       |
-+---------------+---------------------------------------------------------------------------------------------------+-------+----------------+-----------------+-----------------+
-| `SQuAD-v1.1`_ | :config:`DeepPavlov BERT on PyTorch <squad/squad_torch_bert.json>`                                |  en   |    80.79       |     88.30       |     1.1 Gb      |
-+---------------+---------------------------------------------------------------------------------------------------+-------+----------------+-----------------+-----------------+
-| `SQuAD-v1.1`_ | :config:`DeepPavlov R-Net <squad/squad.json>`                                                     |  en   |     71.49      |     80.34       |     ~2.5Gb      |
-+---------------+---------------------------------------------------------------------------------------------------+-------+----------------+-----------------+-----------------+
-|  SDSJ Task B  | :config:`DeepPavlov RuBERT <squad/squad_ru_bert_infer.json>`                                      |  ru   |  66.30 ± 0.24  |   84.60 ± 0.11  |     1325Mb      |
-+---------------+---------------------------------------------------------------------------------------------------+-------+----------------+-----------------+-----------------+
-|  SDSJ Task B  | :config:`DeepPavlov multilingual BERT <squad/squad_ru_bert_infer.json>`                           |  ru   |  64.35 ± 0.39  |   83.39 ± 0.08  |     1323Mb      |
-+---------------+---------------------------------------------------------------------------------------------------+-------+----------------+-----------------+-----------------+
-|  SDSJ Task B  | :config:`DeepPavlov R-Net <squad/squad_ru.json>`                                                  |  ru   |     60.62      |     80.04       |     ~5Gb        |
-+---------------+---------------------------------------------------------------------------------------------------+-------+----------------+-----------------+-----------------+
-|  SDSJ Task B  | :config:`DeepPavlov DistilRuBERT-tiny <squad/squad_ru_convers_distilrubert_2L_torch_infer.json>`  |  ru   |  44.2 ± 0.46   |  65.1 ± 0.36    |     867Mb       |
-+---------------+---------------------------------------------------------------------------------------------------+-------+----------------+-----------------+-----------------+
-|  SDSJ Task B  | :config:`DeepPavlov DistilRuBERT-base <squad/squad_ru_convers_distilrubert_6L_torch_infer.json>`  |  ru   |  61.23 ± 0.42  |  80.36 ± 0.28   |     1.18Gb      |
-+---------------+---------------------------------------------------------------------------------------------------+-------+----------------+-----------------+-----------------+
-|    `DRCD`_    | :config:`DeepPavlov multilingual BERT <squad/squad_zh_bert_mult.json>`                            |  ch   |     84.86      |     89.03       |     630Mb       |
-+---------------+---------------------------------------------------------------------------------------------------+-------+----------------+-----------------+-----------------+
-|    `DRCD`_    | :config:`DeepPavlov Chinese BERT <squad/squad_zh_bert_zh.json>`                                   |  ch   |     84.19      |     89.23       |     362Mb       |
-+---------------+---------------------------------------------------------------------------------------------------+-------+----------------+-----------------+-----------------+
++----------------+---------------------------------------------------------------------------------------------+-------+----------------+-----------------+-----------------+
+|    Dataset     | Model config                                                                                | lang  |    EM (dev)    |    F-1 (dev)    |    Downloads    |
++================+=============================================================================================+=======+================+=================+=================+
+| `SQuAD-v1.1`_  | :config:`DeepPavlov BERT <squad/squad_bert.json>`                                           |  en   |     80.88      |     88.49       |     806Mb       |
++----------------+---------------------------------------------------------------------------------------------+-------+----------------+-----------------+-----------------+
+| `SQuAD-v1.1`_  | :config:`DeepPavlov BERT on PyTorch <squad/squad_torch_bert.json>`                          |  en   |    80.79       |     88.30       |     1.1 Gb      |
++----------------+---------------------------------------------------------------------------------------------+-------+----------------+-----------------+-----------------+
+| `SQuAD-v1.1`_  | :config:`DeepPavlov R-Net <squad/squad.json>`                                               |  en   |     71.49      |     80.34       |     ~2.5Gb      |
++----------------+---------------------------------------------------------------------------------------------+-------+----------------+-----------------+-----------------+
+| `SDSJ Task B`_ | :config:`DeepPavlov RuBERT <squad/squad_ru_bert_infer.json>`                                |  ru   |  66.30 ± 0.24  |   84.60 ± 0.11  |     1325Mb      |
++----------------+---------------------------------------------------------------------------------------------+-------+----------------+-----------------+-----------------+
+| `SDSJ Task B`_ | :config:`DeepPavlov multilingual BERT <squad/squad_ru_bert_infer.json>`                     |  ru   |  64.35 ± 0.39  |   83.39 ± 0.08  |     1323Mb      |
++----------------+---------------------------------------------------------------------------------------------+-------+----------------+-----------------+-----------------+
+| `SDSJ Task B`_ | :config:`DeepPavlov R-Net <squad/squad_ru.json>`                                            |  ru   |     60.62      |     80.04       |     ~5Gb        |
++----------------+---------------------------------------------------------------------------------------------+-------+----------------+-----------------+-----------------+
+| `SDSJ Task B`_ | :config:`DeepPavlov DistilRuBERT-tiny <squad/squad_ru_convers_distilrubert_2L_infer.json>`  |  ru   |  44.2 ± 0.46   |  65.1 ± 0.36    |     867Mb       |
++----------------+---------------------------------------------------------------------------------------------+-------+----------------+-----------------+-----------------+
+| `SDSJ Task B`_ | :config:`DeepPavlov DistilRuBERT-base <squad/squad_ru_convers_distilrubert_6L_infer.json>`  |  ru   |  61.23 ± 0.42  |  80.36 ± 0.28   |     1.18Gb      |
++----------------+---------------------------------------------------------------------------------------------+-------+----------------+-----------------+-----------------+
+|    `DRCD`_     | :config:`DeepPavlov multilingual BERT <squad/squad_zh_bert_mult.json>`                      |  ch   |     84.86      |     89.03       |     630Mb       |
++----------------+---------------------------------------------------------------------------------------------+-------+----------------+-----------------+-----------------+
+|    `DRCD`_     | :config:`DeepPavlov Chinese BERT <squad/squad_zh_bert_zh.json>`                             |  ch   |     84.19      |     89.23       |     362Mb       |
++----------------+---------------------------------------------------------------------------------------------+-------+----------------+-----------------+-----------------+
 
 In the case when answer is not necessary present in given context we have :config:`squad_noans <squad/multi_squad_noans.json>`
 model. This model outputs empty string in case if there is no answer in context.
@@ -570,5 +570,5 @@ goal-oriented bot and a slot-filling model with Telegram UI.
 
 
 .. _`SQuAD-v1.1`: https://arxiv.org/abs/1606.05250
-.. _`SDSJ Task B`: https://sdsj.sberbank.ai/2017/ru/contest.html
+.. _`SDSJ Task B`: https://arxiv.org/abs/1912.09723
 .. _`DRCD`: https://arxiv.org/abs/1806.00920
diff --git a/tests/test_quick_start.py b/tests/test_quick_start.py
index 7b0dea2d9a..74ae65dedc 100644
--- a/tests/test_quick_start.py
+++ b/tests/test_quick_start.py
@@ -146,7 +146,15 @@
     },
     "distil": {
         ("classifiers/paraphraser_convers_distilrubert_2L.json", "distil", ('IP')): [TWO_ARGUMENTS_INFER_CHECK],
-        ("classifiers/paraphraser_convers_distilrubert_6L.json", "distil", ('IP')): [TWO_ARGUMENTS_INFER_CHECK]
+        ("classifiers/paraphraser_convers_distilrubert_6L.json", "distil", ('IP')): [TWO_ARGUMENTS_INFER_CHECK],
+        ("classifiers/rusentiment_convers_distilrubert_2L.json", "distil", ('IP')): [ONE_ARGUMENT_INFER_CHECK],
+        ("classifiers/rusentiment_convers_distilrubert_6L.json", "distil", ('IP')): [ONE_ARGUMENT_INFER_CHECK],
+        ("ner/ner_rus_convers_distilrubert_2L.json", "distil", ('IP')): [ONE_ARGUMENT_INFER_CHECK],
+        ("ner/ner_rus_convers_distilrubert_6L.json", "distil", ('IP')): [ONE_ARGUMENT_INFER_CHECK],
+        ("squad/squad_ru_convers_distilrubert_2L.json", "distil", ('IP')): [TWO_ARGUMENTS_INFER_CHECK],
+        ("squad/squad_ru_convers_distilrubert_2L_infer.json", "distil", ('IP')): [TWO_ARGUMENTS_INFER_CHECK],
+        ("squad/squad_ru_convers_distilrubert_6L.json", "distil", ('IP')): [TWO_ARGUMENTS_INFER_CHECK],
+        ("squad/squad_ru_convers_distilrubert_6L_infer.json", "distil", ('IP')): [TWO_ARGUMENTS_INFER_CHECK],
     },
     "entity_linking": {
         ("kbqa/entity_linking_rus.json", "entity_linking",  ('IP',)):

From f390f1b8f3828bfde7465ab25fd9db2e81cbfae4 Mon Sep 17 00:00:00 2001
From: Fedor Ignatov <ignatov.fedor@gmail.com>
Date: Fri, 30 Jul 2021 20:10:10 +0300
Subject: [PATCH 13/17] refactor: datasets for distil

---
 .../paraphraser_convers_distilrubert_2L.json         | 12 ++++++++++--
 .../paraphraser_convers_distilrubert_6L.json         | 12 ++++++++++--
 .../rusentiment_convers_distilrubert_2L.json         |  8 --------
 .../rusentiment_convers_distilrubert_6L.json         |  8 --------
 4 files changed, 20 insertions(+), 20 deletions(-)

diff --git a/deeppavlov/configs/classifiers/paraphraser_convers_distilrubert_2L.json b/deeppavlov/configs/classifiers/paraphraser_convers_distilrubert_2L.json
index 5dcc8460b5..d0a2eee508 100644
--- a/deeppavlov/configs/classifiers/paraphraser_convers_distilrubert_2L.json
+++ b/deeppavlov/configs/classifiers/paraphraser_convers_distilrubert_2L.json
@@ -76,10 +76,18 @@
       "MODEL_PATH": "{MODELS_PATH}/paraphraser_convers_distilrubert_2L"
     },
     "download": [
-        {
+      {
         "url": "http://files.deeppavlov.ai/deeppavlov_data/classifiers/paraphraser_convers_distilrubert_2L.tar.gz",
         "subdir": "{MODELS_PATH}"
-        }
+      },
+      {
+        "url": "http://files.deeppavlov.ai/datasets/paraphraser.zip",
+        "subdir": "{DOWNLOADS_PATH}/paraphraser_data"
+      },
+      {
+        "url": "http://files.deeppavlov.ai/datasets/paraphraser_gold.zip",
+        "subdir": "{DOWNLOADS_PATH}/paraphraser_data"
+      }
     ]
   }
 }
diff --git a/deeppavlov/configs/classifiers/paraphraser_convers_distilrubert_6L.json b/deeppavlov/configs/classifiers/paraphraser_convers_distilrubert_6L.json
index f8a9e96921..c3f479ca07 100644
--- a/deeppavlov/configs/classifiers/paraphraser_convers_distilrubert_6L.json
+++ b/deeppavlov/configs/classifiers/paraphraser_convers_distilrubert_6L.json
@@ -76,10 +76,18 @@
       "MODEL_PATH": "{MODELS_PATH}/paraphraser_convers_distilrubert_6L"
     },
     "download": [
-        {
+      {
         "url": "http://files.deeppavlov.ai/deeppavlov_data/classifiers/paraphraser_convers_distilrubert_6L.tar.gz",
         "subdir": "{MODELS_PATH}"
-        }
+      },
+      {
+        "url": "http://files.deeppavlov.ai/datasets/paraphraser.zip",
+        "subdir": "{DOWNLOADS_PATH}/paraphraser_data"
+      },
+      {
+        "url": "http://files.deeppavlov.ai/datasets/paraphraser_gold.zip",
+        "subdir": "{DOWNLOADS_PATH}/paraphraser_data"
+      }
     ]
   }
 } 
diff --git a/deeppavlov/configs/classifiers/rusentiment_convers_distilrubert_2L.json b/deeppavlov/configs/classifiers/rusentiment_convers_distilrubert_2L.json
index ff6c6cdabc..8042987116 100644
--- a/deeppavlov/configs/classifiers/rusentiment_convers_distilrubert_2L.json
+++ b/deeppavlov/configs/classifiers/rusentiment_convers_distilrubert_2L.json
@@ -136,14 +136,6 @@
       "MODEL_PATH": "{MODELS_PATH}/classifiers/rusentiment_convers_distilrubert_2L"
     },
     "download": [
-      {
-        "url": "https://raw.githubusercontent.com/strawberrypie/rusentiment/master/Dataset/rusentiment_random_posts.csv",
-        "subdir": "{DOWNLOADS_PATH}/rusentiment"
-      },
-      {
-        "url": "https://raw.githubusercontent.com/strawberrypie/rusentiment/master/Dataset/rusentiment_test.csv",
-        "subdir": "{DOWNLOADS_PATH}/rusentiment"
-      },
       {
         "url": "http://files.deeppavlov.ai/deeppavlov_data/classifiers/rusentiment_convers_distilrubert_2L.tar.gz",
         "subdir": "{MODELS_PATH}/classifiers/"
diff --git a/deeppavlov/configs/classifiers/rusentiment_convers_distilrubert_6L.json b/deeppavlov/configs/classifiers/rusentiment_convers_distilrubert_6L.json
index b211ebed0c..c02f44938a 100644
--- a/deeppavlov/configs/classifiers/rusentiment_convers_distilrubert_6L.json
+++ b/deeppavlov/configs/classifiers/rusentiment_convers_distilrubert_6L.json
@@ -136,14 +136,6 @@
       "MODEL_PATH": "{MODELS_PATH}/classifiers/rusentiment_convers_distilrubert_6L"
     },
     "download": [
-      {
-        "url": "https://raw.githubusercontent.com/strawberrypie/rusentiment/master/Dataset/rusentiment_random_posts.csv",
-        "subdir": "{DOWNLOADS_PATH}/rusentiment"
-      },
-      {
-        "url": "https://raw.githubusercontent.com/strawberrypie/rusentiment/master/Dataset/rusentiment_test.csv",
-        "subdir": "{DOWNLOADS_PATH}/rusentiment"
-      }, 
       {
         "url": "http://files.deeppavlov.ai/deeppavlov_data/classifiers/rusentiment_convers_distilrubert_6L.tar.gz",
         "subdir": "{MODELS_PATH}/classifiers/"

From f99ce48e5c28041aa9c068210fbd1a6b5fd7aaf0 Mon Sep 17 00:00:00 2001
From: ayeffkay <chatton1@mail.ru>
Date: Fri, 24 Sep 2021 18:51:43 +0300
Subject: [PATCH 14/17] Distilrubert-tiny configs update

---
 .../paraphraser_convers_distilrubert_2L.json  |  6 ++--
 .../rusentiment_convers_distilrubert_2L.json  |  6 ++--
 .../ner/ner_rus_convers_distilrubert_2L.json  |  4 +--
 .../squad_ru_convers_distilrubert_2L.json     |  4 +--
 docs/features/overview.rst                    | 32 +++++++++----------
 5 files changed, 26 insertions(+), 26 deletions(-)

diff --git a/deeppavlov/configs/classifiers/paraphraser_convers_distilrubert_2L.json b/deeppavlov/configs/classifiers/paraphraser_convers_distilrubert_2L.json
index d0a2eee508..f35ebc134a 100644
--- a/deeppavlov/configs/classifiers/paraphraser_convers_distilrubert_2L.json
+++ b/deeppavlov/configs/classifiers/paraphraser_convers_distilrubert_2L.json
@@ -28,11 +28,11 @@
         "pretrained_bert": "{TRANSFORMER}",
         "save_path": "{MODEL_PATH}/model",
         "load_path": "{MODEL_PATH}/model",
-        "attention_probs_keep_prob": 0.11,
-        "hidden_keep_prob": 1.0, 
+        "attention_probs_keep_prob": 0.67,
+        "hidden_keep_prob": 0.0, 
         "optimizer": "AdamW",
         "optimizer_parameters": {
-          "lr": 1.89e-05
+          "lr": 9e-05
         },
         "learning_rate_drop_patience": 3,
         "learning_rate_drop_div": 1.5,
diff --git a/deeppavlov/configs/classifiers/rusentiment_convers_distilrubert_2L.json b/deeppavlov/configs/classifiers/rusentiment_convers_distilrubert_2L.json
index 8042987116..42d0c72fc4 100644
--- a/deeppavlov/configs/classifiers/rusentiment_convers_distilrubert_2L.json
+++ b/deeppavlov/configs/classifiers/rusentiment_convers_distilrubert_2L.json
@@ -66,11 +66,11 @@
         "pretrained_bert": "{TRANSFORMER}",
         "save_path": "{MODEL_PATH}/model",
         "load_path": "{MODEL_PATH}/model",
-        "attention_probs_keep_prob": 0.78,
-        "hidden_keep_prob": 0.89, 
+        "attention_probs_keep_prob": 0.33,
+        "hidden_keep_prob": 0.67, 
         "optimizer": "AdamW",
         "optimizer_parameters": {
-          "lr": 7.22e-05
+          "lr": 3.67e-05
         },
         "learning_rate_drop_patience": 5,
         "learning_rate_drop_div": 1.5,
diff --git a/deeppavlov/configs/ner/ner_rus_convers_distilrubert_2L.json b/deeppavlov/configs/ner/ner_rus_convers_distilrubert_2L.json
index 15c931c1eb..6123c18138 100644
--- a/deeppavlov/configs/ner/ner_rus_convers_distilrubert_2L.json
+++ b/deeppavlov/configs/ner/ner_rus_convers_distilrubert_2L.json
@@ -58,14 +58,14 @@
         "n_tags": "#tag_vocab.len",
         "pretrained_bert": "{TRANSFORMER}",
         "attention_probs_keep_prob": 0.11,
-        "hidden_keep_prob": 0.11, 
+        "hidden_keep_prob": 0.67, 
         "return_probas": false,
         "encoder_layer_ids": [
           -1
         ],
         "optimizer": "AdamW",
         "optimizer_parameters": {
-          "lr": 5.45e-05,
+          "lr": 8.11e-05,
           "weight_decay": 1e-06,
           "betas": [
             0.9,
diff --git a/deeppavlov/configs/squad/squad_ru_convers_distilrubert_2L.json b/deeppavlov/configs/squad/squad_ru_convers_distilrubert_2L.json
index f278ad9627..830ded55f6 100644
--- a/deeppavlov/configs/squad/squad_ru_convers_distilrubert_2L.json
+++ b/deeppavlov/configs/squad/squad_ru_convers_distilrubert_2L.json
@@ -68,8 +68,8 @@
         "pretrained_bert": "{TRANSFORMER}",
         "save_path": "{MODEL_PATH}/model",
         "load_path": "{MODEL_PATH}/model",
-        "attention_probs_keep_prob": 0.11,
-        "hidden_keep_prob": 0.33, 
+        "attention_probs_keep_prob": 0.0,
+        "hidden_keep_prob": 0.11, 
         "optimizer": "AdamW",
         "optimizer_parameters": {
           "lr": 9e-05
diff --git a/docs/features/overview.rst b/docs/features/overview.rst
index 7a515a40cb..724e7c6fd7 100644
--- a/docs/features/overview.rst
+++ b/docs/features/overview.rst
@@ -27,7 +27,7 @@ which is inspired by Bi-LSTM+CRF architecture from https://arxiv.org/pdf/1603.01
 +                                                         +       +--------------------------------------------------------------------------------------------+-------------+
 | (Collection 3)                                          |       | :config:`ner_rus.json <ner/ner_rus.json>`                                                  |    95.1     |
 +                                                         +       +--------------------------------------------------------------------------------------------+-------------+
-|                                                         |       | :config:`ner_rus_convers_distilrubert_2L.json  <ner/ner_rus_convers_distilrubert_2L.json>` |  88.4 ± 0.5 |
+|                                                         |       | :config:`ner_rus_convers_distilrubert_2L.json  <ner/ner_rus_convers_distilrubert_2L.json>` |  94.2 ± 0.2 |
 +                                                         +       +--------------------------------------------------------------------------------------------+-------------+
 |                                                         |       | :config:`ner_rus_convers_distilrubert_6L.json  <ner/ner_rus_convers_distilrubert_6L.json>` |  93.3 ± 0.3 |
 +---------------------------------------------------------+-------+--------------------------------------------------------------------------------------------+-------------+
@@ -120,7 +120,7 @@ Several pre-trained models are available and presented in Table below.
 +                  +                     +      +----------------------------------------------------------------------------------------------------+             +------------------+-----------------+-----------+
 |                  |                     |      | :config:`Conversational RuBERT <classifiers/rusentiment_convers_bert.json>`                        |             | 0.7548           | 0.7742          |  657 Mb   |
 +                  +                     +      +----------------------------------------------------------------------------------------------------+             +------------------+-----------------+-----------+
-|                  |                     |      | :config:`Conversational DistilRuBERT-tiny <classifiers/rusentiment_convers_distilrubert_2L.json>`  |             |  0.703 ± 0.0031  | 0.7348 ± 0.0028 |  690 Mb   |
+|                  |                     |      | :config:`Conversational DistilRuBERT-tiny <classifiers/rusentiment_convers_distilrubert_2L.json>`  |             |  0.72 ± 0.0016   | 0.7458 ± 0.0098 |  690 Mb   |
 +                  +                     +      +----------------------------------------------------------------------------------------------------+             +------------------+-----------------+-----------+
 |                  |                     |      | :config:`Conversational DistilRuBERT-base <classifiers/rusentiment_convers_distilrubert_6L.json>`  |             |  0.7376 ± 0.0045 | 0.7645 ± 0.035  |  1.0 Gb   |
 +------------------+---------------------+      +----------------------------------------------------------------------------------------------------+-------------+------------------+-----------------+-----------+
@@ -255,19 +255,19 @@ Available pre-trained models for paraphrase identification:
 .. table::
    :widths: auto
 
-   +------------------------+------------------------------------------------------------------------------------------------------+----------------+-----------------+------------+------------+----------------+-----------------+-----------+
-   |    Dataset             | Model config                                                                                         | Val (accuracy) | Test (accuracy) | Val (F1)   | Test (F1)  | Val (log_loss) | Test (log_loss) | Downloads |
-   +========================+======================================================================================================+================+=================+============+============+================+=================+===========+
-   | `paraphraser.ru`_      | :config:`paraphrase_ident_paraphraser_ft <ranking/paraphrase_ident_paraphraser_interact.json>`       |   83.8         |   75.4          |   87.9     |  80.9      |   0.468        |   0.616         | 5938M     |
-   +------------------------+------------------------------------------------------------------------------------------------------+----------------+-----------------+------------+------------+----------------+-----------------+-----------+
-   | `paraphraser.ru`_      | :config:`paraphrase_bert_multilingual <classifiers/paraphraser_bert.json>`                           |   87.4         |   79.3          |   90.2     |  83.4      |   --           |   --            | 1330M     |
-   +------------------------+------------------------------------------------------------------------------------------------------+----------------+-----------------+------------+------------+----------------+-----------------+-----------+
-   | `paraphraser.ru`_      | :config:`paraphrase_rubert <classifiers/paraphraser_rubert.json>`                                    |   90.2         |   84.9          |   92.3     |  87.9      |   --           |   --            | 1325M     |
-   +------------------------+------------------------------------------------------------------------------------------------------+----------------+-----------------+------------+------------+----------------+-----------------+-----------+
-   | `paraphraser.ru`_      | :config:`paraphraser_convers_distilrubert_2L <classifiers/paraphraser_convers_distilrubert_2L.json>` |  76.1 ± 0.2    |  64.5 ± 0.5     | 81.8 ± 0.2 | 73.9 ± 0.8 |   --           |   --            | 618M      |
-   +------------------------+------------------------------------------------------------------------------------------------------+----------------+-----------------+------------+------------+----------------+-----------------+-----------+
-   | `paraphraser.ru`_      | :config:`paraphraser_convers_distilrubert_6L <classifiers/paraphraser_convers_distilrubert_6L.json>` |  86.5 ± 0.5    |  78.9 ± 0.4     | 89.6 ± 0.3 | 83.2 ± 0.5 |   --           |   --            | 930M      |
-   +------------------------+------------------------------------------------------------------------------------------------------+----------------+-----------------+------------+------------+----------------+-----------------+-----------+
+   +------------------------+------------------------------------------------------------------------------------------------------+----------------+-----------------+--------------+--------------+----------------+-----------------+-----------+
+   |    Dataset             | Model config                                                                                         | Val (accuracy) | Test (accuracy) | Val (F1)     | Test (F1)    | Val (log_loss) | Test (log_loss) | Downloads |
+   +========================+======================================================================================================+================+=================+==============+==============+================+=================+===========+
+   | `paraphraser.ru`_      | :config:`paraphrase_ident_paraphraser_ft <ranking/paraphrase_ident_paraphraser_interact.json>`       |   83.8         |   75.4          |   87.9       |  80.9        |   0.468        |   0.616         | 5938M     |
+   +------------------------+------------------------------------------------------------------------------------------------------+----------------+-----------------+--------------+--------------+----------------+-----------------+-----------+
+   | `paraphraser.ru`_      | :config:`paraphrase_bert_multilingual <classifiers/paraphraser_bert.json>`                           |   87.4         |   79.3          |   90.2       |  83.4        |   --           |   --            | 1330M     |
+   +------------------------+------------------------------------------------------------------------------------------------------+----------------+-----------------+--------------+--------------+----------------+-----------------+-----------+
+   | `paraphraser.ru`_      | :config:`paraphrase_rubert <classifiers/paraphraser_rubert.json>`                                    |   90.2         |   84.9          |   92.3       |  87.9        |   --           |   --            | 1325M     |
+   +------------------------+------------------------------------------------------------------------------------------------------+----------------+-----------------+--------------+--------------+----------------+-----------------+-----------+
+   | `paraphraser.ru`_      | :config:`paraphraser_convers_distilrubert_2L <classifiers/paraphraser_convers_distilrubert_2L.json>` |  79.4 ± 0.01   |  67.5 ± 0.006   | 84.4 ± 0.04  | 76.2 ± 0.006 |   --           |   --            | 618M      |
+   +------------------------+------------------------------------------------------------------------------------------------------+----------------+-----------------+--------------+--------------+----------------+-----------------+-----------+
+   | `paraphraser.ru`_      | :config:`paraphraser_convers_distilrubert_6L <classifiers/paraphraser_convers_distilrubert_6L.json>` |  86.5 ± 0.5    |  78.9 ± 0.4     | 89.6 ± 0.3   | 83.2 ± 0.5   |   --           |   --            | 930M      |
+   +------------------------+------------------------------------------------------------------------------------------------------+----------------+-----------------+--------------+--------------+----------------+-----------------+-----------+
 
 .. _`paraphraser.ru`: https://paraphraser.ru/
 
@@ -346,7 +346,7 @@ R-Net model is based on `R-NET: Machine Reading Comprehension with Self-matching
 +----------------+---------------------------------------------------------------------------------------------+-------+----------------+-----------------+-----------------+
 | `SDSJ Task B`_ | :config:`DeepPavlov R-Net <squad/squad_ru.json>`                                            |  ru   |     60.62      |     80.04       |     ~5Gb        |
 +----------------+---------------------------------------------------------------------------------------------+-------+----------------+-----------------+-----------------+
-| `SDSJ Task B`_ | :config:`DeepPavlov DistilRuBERT-tiny <squad/squad_ru_convers_distilrubert_2L_infer.json>`  |  ru   |  44.2 ± 0.46   |  65.1 ± 0.36    |     867Mb       |
+| `SDSJ Task B`_ | :config:`DeepPavlov DistilRuBERT-tiny <squad/squad_ru_convers_distilrubert_2L_infer.json>`  |  ru   |  48.3 ± 0.41   |  68.9 ± 0.39    |     867Mb       |
 +----------------+---------------------------------------------------------------------------------------------+-------+----------------+-----------------+-----------------+
 | `SDSJ Task B`_ | :config:`DeepPavlov DistilRuBERT-base <squad/squad_ru_convers_distilrubert_6L_infer.json>`  |  ru   |  61.23 ± 0.42  |  80.36 ± 0.28   |     1.18Gb      |
 +----------------+---------------------------------------------------------------------------------------------+-------+----------------+-----------------+-----------------+

From 3da37c461a7f9dae8571e08e1d93214a1a63fd2d Mon Sep 17 00:00:00 2001
From: ayeffkay <chatton1@mail.ru>
Date: Wed, 3 Nov 2021 10:54:29 +0300
Subject: [PATCH 15/17] DistilRuBERT-base: configs updated

---
 .../classifiers/paraphraser_convers_distilrubert_6L.json    | 6 +++---
 .../classifiers/rusentiment_convers_distilrubert_6L.json    | 4 ++--
 deeppavlov/configs/ner/ner_rus_convers_distilrubert_6L.json | 4 ++--
 .../configs/squad/squad_ru_convers_distilrubert_6L.json     | 4 ++--
 4 files changed, 9 insertions(+), 9 deletions(-)

diff --git a/deeppavlov/configs/classifiers/paraphraser_convers_distilrubert_6L.json b/deeppavlov/configs/classifiers/paraphraser_convers_distilrubert_6L.json
index c3f479ca07..02060d97ea 100644
--- a/deeppavlov/configs/classifiers/paraphraser_convers_distilrubert_6L.json
+++ b/deeppavlov/configs/classifiers/paraphraser_convers_distilrubert_6L.json
@@ -28,11 +28,11 @@
         "pretrained_bert": "{TRANSFORMER}",
         "save_path": "{MODEL_PATH}/model",
         "load_path": "{MODEL_PATH}/model",
-        "attention_probs_keep_prob": 0.0,
-        "hidden_keep_prob": 0.67, 
+        "attention_probs_keep_prob": 0.89,
+        "hidden_keep_prob": 0.44, 
         "optimizer": "AdamW",
         "optimizer_parameters": {
-          "lr": 7.22e-05
+          "lr": 5.46e-05
         },
         "learning_rate_drop_patience": 3,
         "learning_rate_drop_div": 1.5,
diff --git a/deeppavlov/configs/classifiers/rusentiment_convers_distilrubert_6L.json b/deeppavlov/configs/classifiers/rusentiment_convers_distilrubert_6L.json
index c02f44938a..f81488dbbb 100644
--- a/deeppavlov/configs/classifiers/rusentiment_convers_distilrubert_6L.json
+++ b/deeppavlov/configs/classifiers/rusentiment_convers_distilrubert_6L.json
@@ -66,8 +66,8 @@
         "pretrained_bert": "{TRANSFORMER}",
         "save_path": "{MODEL_PATH}/model",
         "load_path": "{MODEL_PATH}/model",
-        "attention_probs_keep_prob": 0.78,
-        "hidden_keep_prob": 0, 
+        "attention_probs_keep_prob": 0.22,
+        "hidden_keep_prob": 0.22, 
         "optimizer": "AdamW",
         "optimizer_parameters": {
           "lr": 4.56e-05
diff --git a/deeppavlov/configs/ner/ner_rus_convers_distilrubert_6L.json b/deeppavlov/configs/ner/ner_rus_convers_distilrubert_6L.json
index b2534426a6..b9cf79ab5e 100644
--- a/deeppavlov/configs/ner/ner_rus_convers_distilrubert_6L.json
+++ b/deeppavlov/configs/ner/ner_rus_convers_distilrubert_6L.json
@@ -57,8 +57,8 @@
         "class_name": "torch_transformers_sequence_tagger",
         "n_tags": "#tag_vocab.len",
         "pretrained_bert": "{TRANSFORMER}",
-        "attention_probs_keep_prob": 0.44,
-        "hidden_keep_prob": 0.89, 
+        "attention_probs_keep_prob": 0.0,
+        "hidden_keep_prob": 1.0, 
         "return_probas": false,
         "encoder_layer_ids": [
           -1
diff --git a/deeppavlov/configs/squad/squad_ru_convers_distilrubert_6L.json b/deeppavlov/configs/squad/squad_ru_convers_distilrubert_6L.json
index 8ca10a28f7..2017ef8d8e 100644
--- a/deeppavlov/configs/squad/squad_ru_convers_distilrubert_6L.json
+++ b/deeppavlov/configs/squad/squad_ru_convers_distilrubert_6L.json
@@ -69,10 +69,10 @@
         "save_path": "{MODEL_PATH}/model",
         "load_path": "{MODEL_PATH}/model",
         "attention_probs_keep_prob": 0.0,
-        "hidden_keep_prob": 0.33, 
+        "hidden_keep_prob": 1.0, 
         "optimizer": "AdamW",
         "optimizer_parameters": {
-          "lr": 3.67e-5
+          "lr": 2.78e-05
         },
         "learning_rate_drop_patience": 2,
         "learning_rate_drop_div": 1.5,

From d7618ee9abb06294887ee883c484f3a9974885a8 Mon Sep 17 00:00:00 2001
From: ayeffkay <chatton1@mail.ru>
Date: Wed, 3 Nov 2021 11:20:50 +0300
Subject: [PATCH 16/17] Scores updated

---
 docs/features/overview.rst | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/docs/features/overview.rst b/docs/features/overview.rst
index 724e7c6fd7..376b780cca 100644
--- a/docs/features/overview.rst
+++ b/docs/features/overview.rst
@@ -29,7 +29,7 @@ which is inspired by Bi-LSTM+CRF architecture from https://arxiv.org/pdf/1603.01
 +                                                         +       +--------------------------------------------------------------------------------------------+-------------+
 |                                                         |       | :config:`ner_rus_convers_distilrubert_2L.json  <ner/ner_rus_convers_distilrubert_2L.json>` |  94.2 ± 0.2 |
 +                                                         +       +--------------------------------------------------------------------------------------------+-------------+
-|                                                         |       | :config:`ner_rus_convers_distilrubert_6L.json  <ner/ner_rus_convers_distilrubert_6L.json>` |  93.3 ± 0.3 |
+|                                                         |       | :config:`ner_rus_convers_distilrubert_6L.json  <ner/ner_rus_convers_distilrubert_6L.json>` |  96.4 ± 0.2 |
 +---------------------------------------------------------+-------+--------------------------------------------------------------------------------------------+-------------+
 | Ontonotes                                               | Multi | :config:`ner_ontonotes_bert_mult.json <ner/ner_ontonotes_bert_mult.json>`                  |    88.8     |
 +                                                         +-------+--------------------------------------------------------------------------------------------+-------------+
@@ -120,9 +120,9 @@ Several pre-trained models are available and presented in Table below.
 +                  +                     +      +----------------------------------------------------------------------------------------------------+             +------------------+-----------------+-----------+
 |                  |                     |      | :config:`Conversational RuBERT <classifiers/rusentiment_convers_bert.json>`                        |             | 0.7548           | 0.7742          |  657 Mb   |
 +                  +                     +      +----------------------------------------------------------------------------------------------------+             +------------------+-----------------+-----------+
-|                  |                     |      | :config:`Conversational DistilRuBERT-tiny <classifiers/rusentiment_convers_distilrubert_2L.json>`  |             |  0.72 ± 0.0016   | 0.7458 ± 0.0098 |  690 Mb   |
+|                  |                     |      | :config:`Conversational DistilRuBERT-tiny <classifiers/rusentiment_convers_distilrubert_2L.json>`  |             | 0.72 ± 0.0016    | 0.74 ± 0.01     |  690 Mb   |
 +                  +                     +      +----------------------------------------------------------------------------------------------------+             +------------------+-----------------+-----------+
-|                  |                     |      | :config:`Conversational DistilRuBERT-base <classifiers/rusentiment_convers_distilrubert_6L.json>`  |             |  0.7376 ± 0.0045 | 0.7645 ± 0.035  |  1.0 Gb   |
+|                  |                     |      | :config:`Conversational DistilRuBERT-base <classifiers/rusentiment_convers_distilrubert_6L.json>`  |             | 0.73 ± 0.003     | 0.75 ± 0.013    |  1.0 Gb   |
 +------------------+---------------------+      +----------------------------------------------------------------------------------------------------+-------------+------------------+-----------------+-----------+
 | Intent           | Ru like`Yahoo-L31`_ |      | :config:`Conversational vs Informational on ELMo <classifiers/yahoo_convers_vs_info.json>`         | ROC-AUC     | 0.9412           |   --            |  700 Mb   |
 +------------------+---------------------+------+----------------------------------------------------------------------------------------------------+-------------+------------------+-----------------+-----------+
@@ -266,7 +266,7 @@ Available pre-trained models for paraphrase identification:
    +------------------------+------------------------------------------------------------------------------------------------------+----------------+-----------------+--------------+--------------+----------------+-----------------+-----------+
    | `paraphraser.ru`_      | :config:`paraphraser_convers_distilrubert_2L <classifiers/paraphraser_convers_distilrubert_2L.json>` |  79.4 ± 0.01   |  67.5 ± 0.006   | 84.4 ± 0.04  | 76.2 ± 0.006 |   --           |   --            | 618M      |
    +------------------------+------------------------------------------------------------------------------------------------------+----------------+-----------------+--------------+--------------+----------------+-----------------+-----------+
-   | `paraphraser.ru`_      | :config:`paraphraser_convers_distilrubert_6L <classifiers/paraphraser_convers_distilrubert_6L.json>` |  86.5 ± 0.5    |  78.9 ± 0.4     | 89.6 ± 0.3   | 83.2 ± 0.5   |   --           |   --            | 930M      |
+   | `paraphraser.ru`_      | :config:`paraphraser_convers_distilrubert_6L <classifiers/paraphraser_convers_distilrubert_6L.json>` |  87.1 ± 0.01   |  78.0 ± 0.01    | 90.0 ± 0.08  | 82.9 ± 0.003 |   --           |   --            | 930M      |
    +------------------------+------------------------------------------------------------------------------------------------------+----------------+-----------------+--------------+--------------+----------------+-----------------+-----------+
 
 .. _`paraphraser.ru`: https://paraphraser.ru/
@@ -348,7 +348,7 @@ R-Net model is based on `R-NET: Machine Reading Comprehension with Self-matching
 +----------------+---------------------------------------------------------------------------------------------+-------+----------------+-----------------+-----------------+
 | `SDSJ Task B`_ | :config:`DeepPavlov DistilRuBERT-tiny <squad/squad_ru_convers_distilrubert_2L_infer.json>`  |  ru   |  48.3 ± 0.41   |  68.9 ± 0.39    |     867Mb       |
 +----------------+---------------------------------------------------------------------------------------------+-------+----------------+-----------------+-----------------+
-| `SDSJ Task B`_ | :config:`DeepPavlov DistilRuBERT-base <squad/squad_ru_convers_distilrubert_6L_infer.json>`  |  ru   |  61.23 ± 0.42  |  80.36 ± 0.28   |     1.18Gb      |
+| `SDSJ Task B`_ | :config:`DeepPavlov DistilRuBERT-base <squad/squad_ru_convers_distilrubert_6L_infer.json>`  |  ru   |  61.77 ± 0.25  |  80.39 ± 0.21   |     1.18Gb      |
 +----------------+---------------------------------------------------------------------------------------------+-------+----------------+-----------------+-----------------+
 |    `DRCD`_     | :config:`DeepPavlov multilingual BERT <squad/squad_zh_bert_mult.json>`                      |  ch   |     84.86      |     89.03       |     630Mb       |
 +----------------+---------------------------------------------------------------------------------------------+-------+----------------+-----------------+-----------------+

From f1118960c0c28052d5143c5a22d6ca4776e670d5 Mon Sep 17 00:00:00 2001
From: ayeffkay <chatton1@mail.ru>
Date: Thu, 4 Nov 2021 12:21:42 +0300
Subject: [PATCH 17/17] Hyperparams changed for DistilRuBERT-base NER and SQuAD configs

---
 deeppavlov/configs/ner/ner_rus_convers_distilrubert_6L.json   | 4 ++--
 .../configs/squad/squad_ru_convers_distilrubert_6L.json       | 4 ++--
 2 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/deeppavlov/configs/ner/ner_rus_convers_distilrubert_6L.json b/deeppavlov/configs/ner/ner_rus_convers_distilrubert_6L.json
index b9cf79ab5e..f719065d58 100644
--- a/deeppavlov/configs/ner/ner_rus_convers_distilrubert_6L.json
+++ b/deeppavlov/configs/ner/ner_rus_convers_distilrubert_6L.json
@@ -1,4 +1,4 @@
- {
+{
   "dataset_reader": {
     "class_name": "conll2003_reader",
     "data_path": "{DOWNLOADS_PATH}/total_rus/",
@@ -57,7 +57,7 @@
         "class_name": "torch_transformers_sequence_tagger",
         "n_tags": "#tag_vocab.len",
         "pretrained_bert": "{TRANSFORMER}",
-        "attention_probs_keep_prob": 0.0,
+        "attention_probs_keep_prob": 0.56,
         "hidden_keep_prob": 1.0, 
         "return_probas": false,
         "encoder_layer_ids": [
diff --git a/deeppavlov/configs/squad/squad_ru_convers_distilrubert_6L.json b/deeppavlov/configs/squad/squad_ru_convers_distilrubert_6L.json
index 2017ef8d8e..58e815cc77 100644
--- a/deeppavlov/configs/squad/squad_ru_convers_distilrubert_6L.json
+++ b/deeppavlov/configs/squad/squad_ru_convers_distilrubert_6L.json
@@ -68,8 +68,8 @@
         "pretrained_bert": "{TRANSFORMER}",
         "save_path": "{MODEL_PATH}/model",
         "load_path": "{MODEL_PATH}/model",
-        "attention_probs_keep_prob": 0.0,
-        "hidden_keep_prob": 1.0, 
+        "attention_probs_keep_prob": 0.45,
+        "hidden_keep_prob": 0.56, 
         "optimizer": "AdamW",
         "optimizer_parameters": {
           "lr": 2.78e-05